From 0a30abdf3fb8ac84474c8ca34134eca6786dc5d2 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyin Date: Wed, 5 Feb 2020 17:59:12 +0300 Subject: [PATCH] Standalone native lib build CMake --- .gitignore | 3 + CMakeLists.txt | 99 + alloc_pool.cc | 100 + arm/.deps/libde265_arm_la-arm.Plo | 1 + arm/.deps/libde265_arm_neon_la-asm.Plo | 1 + arm/.deps/libde265_arm_neon_la-cpudetect.Plo | 1 + ...libde265_arm_neon_la-hevcdsp_qpel_neon.Plo | 1 + arm/.deps/libde265_arm_neon_la-neon.Plo | 1 + arm/Makefile | 770 ++ arm/Makefile.am | 38 + arm/Makefile.in | 770 ++ arm/arm.cc | 123 + arm/arm.h | 28 + arm/asm.S | 325 + arm/cpudetect.S | 29 + arm/hevcdsp_qpel_neon.S | 1004 +++ arm/neon.S | 59 + bitstream.cc | 176 + cabac.cc | 1033 +++ configparam.cc | 491 ++ contextmodel.cc | 347 + de265.cc | 711 ++ deblock.cc | 1058 +++ decctx.cc | 2285 ++++++ dpb.cc | 296 + fallback-dct.cc | 1210 +++ fallback-motion.cc | 648 ++ fallback.cc | 127 + image-io.cc | 220 + image.cc | 817 ++ intrapred.cc | 364 + libde265/acceleration.h | 359 + libde265/alloc_pool.h | 61 + libde265/bitstream.h | 63 + libde265/cabac.h | 211 + libde265/configparam.h | 401 + libde265/contextmodel.h | 130 + libde265/de265-version.h | 36 + libde265/de265.h | 437 + libde265/deblock.h | 29 + libde265/decctx.h | 528 ++ libde265/dpb.h | 118 + libde265/en265.h | 218 + libde265/fallback-dct.h | 96 + libde265/fallback-motion.h | 104 + libde265/fallback.h | 28 + libde265/image-io.h | 121 + libde265/image.h | 864 ++ libde265/intrapred.h | 678 ++ libde265/md5.h | 45 + libde265/motion.h | 131 + libde265/nal-parser.h | 154 + libde265/nal.h | 129 + libde265/pps.h | 163 + libde265/quality.h | 47 + libde265/refpic.h | 61 + libde265/sao.h | 36 + libde265/scan.h | 43 + libde265/sei.h | 89 + libde265/slice.h | 313 + libde265/sps.h | 257 + libde265/threads.h | 148 + libde265/transform.h | 65 + libde265/util.h | 229 + libde265/visualize.h | 50 + libde265/vps.h | 173 + libde265/vui.h | 126 + md5.cc | 295 + motion.cc | 2111 +++++ nal-parser.cc | 446 ++ 
nal.cc | 166 + pps.cc | 992 +++ quality.cc | 111 + refpic.cc | 434 + sao.cc | 524 ++ scan.cc | 152 + sei.cc | 501 ++ slice.cc | 5072 ++++++++++++ sps.cc | 1298 +++ threads.cc | 312 + transform.cc | 739 ++ util.cc | 247 + visualize.cc | 562 ++ vps.cc | 602 ++ vui.cc | 425 + x86/.deps/libde265_x86_la-sse.Plo | 64 + x86/.deps/libde265_x86_sse_la-sse-dct.Plo | 431 + x86/.deps/libde265_x86_sse_la-sse-motion.Plo | 432 + x86/CMakeLists.txt | 23 + x86/Makefile | 703 ++ x86/Makefile.am | 22 + x86/Makefile.in | 703 ++ x86/sse-dct.cc | 7094 +++++++++++++++++ x86/sse-dct.h | 35 + x86/sse-motion.cc | 4971 ++++++++++++ x86/sse-motion.h | 104 + x86/sse.cc | 104 + x86/sse.h | 28 + 98 files changed, 49580 insertions(+) create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 alloc_pool.cc create mode 100644 arm/.deps/libde265_arm_la-arm.Plo create mode 100644 arm/.deps/libde265_arm_neon_la-asm.Plo create mode 100644 arm/.deps/libde265_arm_neon_la-cpudetect.Plo create mode 100644 arm/.deps/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo create mode 100644 arm/.deps/libde265_arm_neon_la-neon.Plo create mode 100644 arm/Makefile create mode 100644 arm/Makefile.am create mode 100644 arm/Makefile.in create mode 100644 arm/arm.cc create mode 100644 arm/arm.h create mode 100644 arm/asm.S create mode 100644 arm/cpudetect.S create mode 100644 arm/hevcdsp_qpel_neon.S create mode 100644 arm/neon.S create mode 100644 bitstream.cc create mode 100644 cabac.cc create mode 100644 configparam.cc create mode 100644 contextmodel.cc create mode 100644 de265.cc create mode 100644 deblock.cc create mode 100644 decctx.cc create mode 100644 dpb.cc create mode 100644 fallback-dct.cc create mode 100644 fallback-motion.cc create mode 100644 fallback.cc create mode 100644 image-io.cc create mode 100644 image.cc create mode 100644 intrapred.cc create mode 100644 libde265/acceleration.h create mode 100644 libde265/alloc_pool.h create mode 100644 libde265/bitstream.h create mode 100644 
libde265/cabac.h create mode 100644 libde265/configparam.h create mode 100644 libde265/contextmodel.h create mode 100644 libde265/de265-version.h create mode 100644 libde265/de265.h create mode 100644 libde265/deblock.h create mode 100644 libde265/decctx.h create mode 100644 libde265/dpb.h create mode 100644 libde265/en265.h create mode 100644 libde265/fallback-dct.h create mode 100644 libde265/fallback-motion.h create mode 100644 libde265/fallback.h create mode 100644 libde265/image-io.h create mode 100644 libde265/image.h create mode 100644 libde265/intrapred.h create mode 100644 libde265/md5.h create mode 100644 libde265/motion.h create mode 100644 libde265/nal-parser.h create mode 100644 libde265/nal.h create mode 100644 libde265/pps.h create mode 100644 libde265/quality.h create mode 100644 libde265/refpic.h create mode 100644 libde265/sao.h create mode 100644 libde265/scan.h create mode 100644 libde265/sei.h create mode 100644 libde265/slice.h create mode 100644 libde265/sps.h create mode 100644 libde265/threads.h create mode 100644 libde265/transform.h create mode 100644 libde265/util.h create mode 100644 libde265/visualize.h create mode 100644 libde265/vps.h create mode 100644 libde265/vui.h create mode 100644 md5.cc create mode 100644 motion.cc create mode 100644 nal-parser.cc create mode 100644 nal.cc create mode 100644 pps.cc create mode 100644 quality.cc create mode 100644 refpic.cc create mode 100644 sao.cc create mode 100644 scan.cc create mode 100644 sei.cc create mode 100644 slice.cc create mode 100644 sps.cc create mode 100644 threads.cc create mode 100644 transform.cc create mode 100644 util.cc create mode 100644 visualize.cc create mode 100644 vps.cc create mode 100644 vui.cc create mode 100644 x86/.deps/libde265_x86_la-sse.Plo create mode 100644 x86/.deps/libde265_x86_sse_la-sse-dct.Plo create mode 100644 x86/.deps/libde265_x86_sse_la-sse-motion.Plo create mode 100644 x86/CMakeLists.txt create mode 100644 x86/Makefile create mode 100644 
x86/Makefile.am create mode 100644 x86/Makefile.in create mode 100644 x86/sse-dct.cc create mode 100644 x86/sse-dct.h create mode 100644 x86/sse-motion.cc create mode 100644 x86/sse-motion.h create mode 100644 x86/sse.cc create mode 100644 x86/sse.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..775f878 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +Release +*.la +*.o diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..b60e702 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,99 @@ +cmake_minimum_required(VERSION 3.16) + +project(debug_h265) + +include(CMakePackageConfigHelpers) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +include_directories(libde265) +include_directories(.) + +set (libde265_sources + alloc_pool.cc + bitstream.cc + cabac.cc + configparam.cc + contextmodel.cc + de265.cc + deblock.cc + decctx.cc + dpb.cc + # en265.cc + fallback-dct.cc + fallback-motion.cc + fallback.cc + image-io.cc + image.cc + intrapred.cc + md5.cc + motion.cc + nal-parser.cc + nal.cc + pps.cc + quality.cc + refpic.cc + sao.cc + scan.cc + sei.cc + slice.cc + sps.cc + threads.cc + transform.cc + util.cc + visualize.cc + vps.cc + vui.cc +) + +set (libde265_headers + acceleration.h + alloc_pool.h + bitstream.h + cabac.h + configparam.h + deblock.h + decctx.h + dpb.h + en265.h + fallback-dct.h + fallback-motion.h + fallback.h + image-io.h + image.h + intrapred.h + md5.h + motion.h + nal-parser.h + nal.h + pps.h + quality.h + refpic.h + sao.h + scan.h + sei.h + slice.h + sps.h + threads.h + transform.h + util.h + visualize.h + vps.h + vui.h +) + + +add_definitions(-DLIBDE265_EXPORTS) + +#add_subdirectory (encoder) + +if(SUPPORTS_SSE4_1) + add_definitions(-DHAVE_SSE4_1) + add_subdirectory (x86) +endif() + +add_library(${PROJECT_NAME} STATIC ${libde265_sources} ${ENCODER_OBJECTS} ${X86_OBJECTS}) +find_package(Threads REQUIRED) # stop at configure time if no thread library exists; Threads::Threads is linked unconditionally below +target_link_libraries(${PROJECT_NAME} PRIVATE Threads::Threads) 
diff --git a/alloc_pool.cc b/alloc_pool.cc new file mode 100644 index 0000000..b056397 --- /dev/null +++ b/alloc_pool.cc @@ -0,0 +1,100 @@ +/* + * H.265 video codec. + * Copyright (c) 2014 struktur AG, Dirk Farin + * + * Authors: Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include +#include "libde265/alloc_pool.h" +#include "libde265/util.h" +#include +#include + +#define DEBUG_MEMORY 1 + + +alloc_pool::alloc_pool(size_t objSize, int poolSize, bool grow) + : mObjSize(objSize), + mPoolSize(poolSize), + mGrow(grow) +{ + m_freeList.reserve(poolSize); + m_memBlocks.reserve(8); + + add_memory_block(); +} + + +void alloc_pool::add_memory_block() +{ + uint8_t* p = new uint8_t[mObjSize * mPoolSize]; + m_memBlocks.push_back(p); + + for (int i=0;i&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + 
-*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/libde265 +pkgincludedir = $(includedir)/libde265 +pkglibdir = $(libdir)/libde265 +pkglibexecdir = $(libexecdir)/libde265 +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = x86_64-pc-linux-gnu +host_triplet = x86_64-pc-linux-gnu +target_triplet = x86_64-pc-linux-gnu + +# NEON specific functions +#am__append_1 = libde265_arm_neon.la +#am__append_2 = libde265_arm_neon.la +subdir = libde265/arm +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/ax_compare_version.m4 \ + $(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \ + $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ + $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ + $(top_srcdir)/m4/lt~obsolete.m4 \ + $(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) +libde265_arm_la_DEPENDENCIES = $(am__append_2) 
+am_libde265_arm_la_OBJECTS = libde265_arm_la-arm.lo +libde265_arm_la_OBJECTS = $(am_libde265_arm_la_OBJECTS) +AM_V_lt = $(am__v_lt_$(V)) +am__v_lt_ = $(am__v_lt_$(AM_DEFAULT_VERBOSITY)) +am__v_lt_0 = --silent +am__v_lt_1 = +libde265_arm_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ + $(libde265_arm_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +libde265_arm_neon_la_LIBADD = +am__libde265_arm_neon_la_SOURCES_DIST = asm.S cpudetect.S \ + hevcdsp_qpel_neon.S neon.S +#am_libde265_arm_neon_la_OBJECTS = \ +# libde265_arm_neon_la-asm.lo \ +# libde265_arm_neon_la-cpudetect.lo \ +# libde265_arm_neon_la-hevcdsp_qpel_neon.lo \ +# libde265_arm_neon_la-neon.lo +libde265_arm_neon_la_OBJECTS = $(am_libde265_arm_neon_la_OBJECTS) +#am_libde265_arm_neon_la_rpath = +AM_V_P = $(am__v_P_$(V)) +am__v_P_ = $(am__v_P_$(AM_DEFAULT_VERBOSITY)) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_$(V)) +am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_$(V)) +am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY)) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I. 
-I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__maybe_remake_depfiles = depfiles +am__depfiles_remade = ./$(DEPDIR)/libde265_arm_la-arm.Plo \ + ./$(DEPDIR)/libde265_arm_neon_la-asm.Plo \ + ./$(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo \ + ./$(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo \ + ./$(DEPDIR)/libde265_arm_neon_la-neon.Plo +am__mv = mv -f +CPPASCOMPILE = $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CCASFLAGS) $(CCASFLAGS) +LTCPPASCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CCASFLAGS) $(CCASFLAGS) +AM_V_CPPAS = $(am__v_CPPAS_$(V)) +am__v_CPPAS_ = $(am__v_CPPAS_$(AM_DEFAULT_VERBOSITY)) +am__v_CPPAS_0 = @echo " CPPAS " $@; +am__v_CPPAS_1 = +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CXXFLAGS) $(CXXFLAGS) +AM_V_CXX = $(am__v_CXX_$(V)) +am__v_CXX_ = $(am__v_CXX_$(AM_DEFAULT_VERBOSITY)) +am__v_CXX_0 = @echo " CXX " $@; +am__v_CXX_1 = +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ + $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CXXLD = $(am__v_CXXLD_$(V)) +am__v_CXXLD_ = $(am__v_CXXLD_$(AM_DEFAULT_VERBOSITY)) +am__v_CXXLD_0 = @echo " CXXLD " $@; +am__v_CXXLD_1 = +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_$(V)) +am__v_CC_ = 
$(am__v_CC_$(AM_DEFAULT_VERBOSITY)) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_$(V)) +am__v_CCLD_ = $(am__v_CCLD_$(AM_DEFAULT_VERBOSITY)) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +SOURCES = $(libde265_arm_la_SOURCES) $(libde265_arm_neon_la_SOURCES) +DIST_SOURCES = $(libde265_arm_la_SOURCES) \ + $(am__libde265_arm_neon_la_SOURCES_DIST) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. 
+am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = ${SHELL} /home/dima/git/libde265/missing aclocal-1.16 +ALLOCA = +AMTAR = $${TAR-tar} +AM_DEFAULT_VERBOSITY = 1 +AR = ar +AUTOCONF = ${SHELL} /home/dima/git/libde265/missing autoconf +AUTOHEADER = ${SHELL} /home/dima/git/libde265/missing autoheader +AUTOMAKE = ${SHELL} /home/dima/git/libde265/missing automake-1.16 +AWK = gawk +CC = gcc +CCAS = gcc +CCASDEPMODE = depmode=gcc3 +CCASFLAGS = -g -O2 +CCDEPMODE = depmode=gcc3 +CFLAGS = -g -O2 -std=c99 -Wall +CPP = gcc -E +CPPFLAGS = +CXX = g++ +CXXCPP = g++ -E +CXXDEPMODE = depmode=gcc3 +CXXFLAGS = -g -O2 -Werror=return-type -Werror=unused-result -Werror=reorder -DDE265_LOG_ERROR +CYGPATH_W = echo +DEFS = -DHAVE_CONFIG_H +DEPDIR = .deps +DLLTOOL = false +DSYMUTIL = +DUMPBIN = +ECHO_C = +ECHO_N = -n +ECHO_T = +EGREP = /usr/bin/grep -E +EXEEXT = +FGREP = /usr/bin/grep -F +GREP = /usr/bin/grep +HAVE_CXX11 = +INSTALL = /usr/bin/install -c +INSTALL_DATA = ${INSTALL} -m 644 +INSTALL_PROGRAM = ${INSTALL} +INSTALL_SCRIPT = ${INSTALL} +INSTALL_STRIP_PROGRAM = $(install_sh) -c -s +LD = /usr/bin/ld -m elf_x86_64 +LDFLAGS = +LIBDE265_AGE = 0 +LIBDE265_CURRENT = 0 +LIBDE265_REVISION = 12 +LIBOBJS = +LIBS = -lpthread -lm +LIBTOOL = $(SHELL) $(top_builddir)/libtool +LIPO = +LN_S = ln -s +LTLIBOBJS = +LT_SYS_LIBRARY_PATH = +MAKEINFO = ${SHELL} /home/dima/git/libde265/missing makeinfo +MANIFEST_TOOL = : +MKDIR_P = /usr/bin/mkdir -p +NM = /usr/bin/nm -B +NMEDIT = +NUMERIC_VERSION = 0x01000500 +OBJDUMP = objdump +OBJEXT = o +OTOOL = +OTOOL64 = +PACKAGE = libde265 +PACKAGE_BUGREPORT = farin@struktur.de +PACKAGE_NAME = libde265 +PACKAGE_STRING = libde265 1.0.5 +PACKAGE_TARNAME = 
libde265 +PACKAGE_URL = +PACKAGE_VERSION = 1.0.5 +PATH_SEPARATOR = : +PKG_CONFIG = /usr/bin/pkg-config +PKG_CONFIG_LIBDIR = +PKG_CONFIG_PATH = +QTCHOOSER = +QTMOC = /usr/bin/moc-qt5 +QT_CFLAGS = -I/usr/include/qt/QtCore -I/usr/include/qt -I/usr/include/qt/QtGui -DQT_WIDGETS_LIB -I/usr/include/qt/QtWidgets -DQT_GUI_LIB -DQT_CORE_LIB +QT_LIBS = -lQt5Widgets -lQt5Gui -lQt5Core +RANLIB = ranlib +SDL_CFLAGS = -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT +SDL_LIBS = -lSDL -lpthread +SED = /usr/bin/sed +SET_MAKE = +SHELL = /bin/sh +STRIP = strip +SWSCALE_CFLAGS = +SWSCALE_LIBS = -lswscale +VERSION = 1.0.5 +VIDEOGFX_CFLAGS = +VIDEOGFX_LIBS = +abs_builddir = /home/dima/git/libde265/libde265/arm +abs_srcdir = /home/dima/git/libde265/libde265/arm +abs_top_builddir = /home/dima/git/libde265 +abs_top_srcdir = /home/dima/git/libde265 +ac_ct_AR = ar +ac_ct_CC = gcc +ac_ct_CXX = g++ +ac_ct_DUMPBIN = +am__include = include +am__leading_dot = . +am__quote = +am__tar = $${TAR-tar} chof - "$$tardir" +am__untar = $${TAR-tar} xf - +bindir = ${exec_prefix}/bin +build = x86_64-pc-linux-gnu +build_alias = +build_cpu = x86_64 +build_os = linux-gnu +build_vendor = pc +builddir = . +datadir = ${datarootdir} +datarootdir = ${prefix}/share +docdir = ${datarootdir}/doc/${PACKAGE_TARNAME} +dvidir = ${docdir} +exec_prefix = ${prefix} +host = x86_64-pc-linux-gnu +host_alias = +host_cpu = x86_64 +host_os = linux-gnu +host_vendor = pc +htmldir = ${docdir} +includedir = ${prefix}/include +infodir = ${datarootdir}/info +install_sh = ${SHELL} /home/dima/git/libde265/install-sh +libdir = ${exec_prefix}/lib +libexecdir = ${exec_prefix}/libexec +localedir = ${datarootdir}/locale +localstatedir = ${prefix}/var +mandir = ${datarootdir}/man +mkdir_p = $(MKDIR_P) +oldincludedir = /usr/include +pdfdir = ${docdir} +prefix = /usr/local +program_transform_name = s,x,x, +psdir = ${docdir} +sbindir = ${exec_prefix}/sbin +sharedstatedir = ${prefix}/com +srcdir = . 
+sysconfdir = ${prefix}/etc +target = x86_64-pc-linux-gnu +target_alias = +target_cpu = x86_64 +target_os = linux-gnu +target_vendor = pc +top_build_prefix = ../../ +top_builddir = ../.. +top_srcdir = ../.. +noinst_LTLIBRARIES = libde265_arm.la $(am__append_1) +libde265_arm_la_CXXFLAGS = -I.. $(CFLAG_VISIBILITY) +libde265_arm_la_SOURCES = arm.cc arm.h +libde265_arm_la_LIBADD = $(am__append_2) +#libde265_arm_neon_la_CXXFLAGS = -mfpu=neon -I.. $(CFLAG_VISIBILITY) +#libde265_arm_neon_la_CCASFLAGS = -mfpu=neon -I.. \ +# -DHAVE_NEON \ +# -DEXTERN_ASM= \ +# -DHAVE_AS_FUNC \ +# -DHAVE_SECTION_DATA_REL_RO + +#libde265_arm_neon_la_SOURCES = \ +# asm.S \ +# cpudetect.S \ +# hevcdsp_qpel_neon.S \ +# neon.S + +all: all-am + +.SUFFIXES: +.SUFFIXES: .S .cc .lo .o .obj +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu libde265/arm/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu libde265/arm/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm -f $${locs}; \ + rm -f $${locs}; \ + } + +libde265_arm.la: $(libde265_arm_la_OBJECTS) $(libde265_arm_la_DEPENDENCIES) $(EXTRA_libde265_arm_la_DEPENDENCIES) + $(AM_V_CXXLD)$(libde265_arm_la_LINK) $(libde265_arm_la_OBJECTS) $(libde265_arm_la_LIBADD) $(LIBS) + +libde265_arm_neon.la: $(libde265_arm_neon_la_OBJECTS) $(libde265_arm_neon_la_DEPENDENCIES) $(EXTRA_libde265_arm_neon_la_DEPENDENCIES) + $(AM_V_CCLD)$(LINK) $(am_libde265_arm_neon_la_rpath) $(libde265_arm_neon_la_OBJECTS) $(libde265_arm_neon_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +include ./$(DEPDIR)/libde265_arm_la-arm.Plo # am--include-marker +include ./$(DEPDIR)/libde265_arm_neon_la-asm.Plo # am--include-marker +include ./$(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo # am--include-marker +include ./$(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo # am--include-marker +include ./$(DEPDIR)/libde265_arm_neon_la-neon.Plo # am--include-marker + +$(am__depfiles_remade): + @$(MKDIR_P) $(@D) + 
@echo '# dummy' >$@-t && $(am__mv) $@-t $@ + +am--depfiles: $(am__depfiles_remade) + +.S.o: + $(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< + $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +# $(AM_V_CPPAS)source='$<' object='$@' libtool=no \ +# DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) \ +# $(AM_V_CPPAS_no)$(CPPASCOMPILE) -c -o $@ $< + +.S.obj: + $(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` + $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +# $(AM_V_CPPAS)source='$<' object='$@' libtool=no \ +# DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) \ +# $(AM_V_CPPAS_no)$(CPPASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.S.lo: + $(AM_V_CPPAS)$(LTCPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< + $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +# $(AM_V_CPPAS)source='$<' object='$@' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) \ +# $(AM_V_CPPAS_no)$(LTCPPASCOMPILE) -c -o $@ $< + +libde265_arm_neon_la-asm.lo: asm.S + $(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-asm.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-asm.Tpo -c -o libde265_arm_neon_la-asm.lo `test -f 'asm.S' || echo '$(srcdir)/'`asm.S + $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-asm.Tpo $(DEPDIR)/libde265_arm_neon_la-asm.Plo +# $(AM_V_CPPAS)source='asm.S' object='libde265_arm_neon_la-asm.lo' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) \ +# $(AM_V_CPPAS_no)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -c -o libde265_arm_neon_la-asm.lo `test -f 'asm.S' || echo '$(srcdir)/'`asm.S + +libde265_arm_neon_la-cpudetect.lo: cpudetect.S + 
$(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-cpudetect.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-cpudetect.Tpo -c -o libde265_arm_neon_la-cpudetect.lo `test -f 'cpudetect.S' || echo '$(srcdir)/'`cpudetect.S + $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-cpudetect.Tpo $(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo +# $(AM_V_CPPAS)source='cpudetect.S' object='libde265_arm_neon_la-cpudetect.lo' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) \ +# $(AM_V_CPPAS_no)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -c -o libde265_arm_neon_la-cpudetect.lo `test -f 'cpudetect.S' || echo '$(srcdir)/'`cpudetect.S + +libde265_arm_neon_la-hevcdsp_qpel_neon.lo: hevcdsp_qpel_neon.S + $(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-hevcdsp_qpel_neon.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Tpo -c -o libde265_arm_neon_la-hevcdsp_qpel_neon.lo `test -f 'hevcdsp_qpel_neon.S' || echo '$(srcdir)/'`hevcdsp_qpel_neon.S + $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Tpo $(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo +# $(AM_V_CPPAS)source='hevcdsp_qpel_neon.S' object='libde265_arm_neon_la-hevcdsp_qpel_neon.lo' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) \ +# $(AM_V_CPPAS_no)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) 
$(CCASFLAGS) -c -o libde265_arm_neon_la-hevcdsp_qpel_neon.lo `test -f 'hevcdsp_qpel_neon.S' || echo '$(srcdir)/'`hevcdsp_qpel_neon.S + +libde265_arm_neon_la-neon.lo: neon.S + $(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-neon.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-neon.Tpo -c -o libde265_arm_neon_la-neon.lo `test -f 'neon.S' || echo '$(srcdir)/'`neon.S + $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-neon.Tpo $(DEPDIR)/libde265_arm_neon_la-neon.Plo +# $(AM_V_CPPAS)source='neon.S' object='libde265_arm_neon_la-neon.lo' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) \ +# $(AM_V_CPPAS_no)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -c -o libde265_arm_neon_la-neon.lo `test -f 'neon.S' || echo '$(srcdir)/'`neon.S + +.cc.o: + $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< + $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +# $(AM_V_CXX)source='$<' object='$@' libtool=no \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(CXXCOMPILE) -c -o $@ $< + +.cc.obj: + $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` + $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +# $(AM_V_CXX)source='$<' object='$@' libtool=no \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.cc.lo: + $(AM_V_CXX)$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< + $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +# $(AM_V_CXX)source='$<' object='$@' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(LTCXXCOMPILE) -c -o $@ $< + 
+libde265_arm_la-arm.lo: arm.cc + $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_arm_la-arm.lo -MD -MP -MF $(DEPDIR)/libde265_arm_la-arm.Tpo -c -o libde265_arm_la-arm.lo `test -f 'arm.cc' || echo '$(srcdir)/'`arm.cc + $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_la-arm.Tpo $(DEPDIR)/libde265_arm_la-arm.Plo +# $(AM_V_CXX)source='arm.cc' object='libde265_arm_la-arm.lo' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_arm_la-arm.lo `test -f 'arm.cc' || echo '$(srcdir)/'`arm.cc + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) 
sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(BUILT_SOURCES) + $(MAKE) $(AM_MAKEFLAGS) distdir-am + +distdir-am: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -f ./$(DEPDIR)/libde265_arm_la-arm.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-asm.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-neon.Plo + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f ./$(DEPDIR)/libde265_arm_la-arm.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-asm.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-neon.Plo + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \ + clean-generic clean-libtool clean-noinstLTLIBRARIES \ + cscopelist-am ctags ctags-am distclean distclean-compile \ + distclean-generic distclean-libtool distclean-tags distdir dvi \ + dvi-am html html-am info info-am install install-am \ + install-data install-data-am install-dvi install-dvi-am \ + install-exec install-exec-am install-html install-html-am \ + install-info install-info-am install-man install-pdf \ + 
install-pdf-am install-ps install-ps-am install-strip \ + installcheck installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags tags-am uninstall uninstall-am + +.PRECIOUS: Makefile + + +# libde265_arm_la_CXXFLAGS += -DHAVE_VISIBILITY + +# libde265_arm_neon_la_CCASFLAGS += -DCONFIG_THUMB + +## libde265_arm_neon_la_CXXFLAGS += -DHAVE_VISIBILITY + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/arm/Makefile.am b/arm/Makefile.am new file mode 100644 index 0000000..9ef62d9 --- /dev/null +++ b/arm/Makefile.am @@ -0,0 +1,38 @@ +noinst_LTLIBRARIES = libde265_arm.la + +libde265_arm_la_CXXFLAGS = -I.. $(CFLAG_VISIBILITY) +libde265_arm_la_SOURCES = arm.cc arm.h +libde265_arm_la_LIBADD = + +if HAVE_VISIBILITY + libde265_arm_la_CXXFLAGS += -DHAVE_VISIBILITY +endif + + +if ENABLE_NEON_OPT +# NEON specific functions + +noinst_LTLIBRARIES += libde265_arm_neon.la +libde265_arm_la_LIBADD += libde265_arm_neon.la +libde265_arm_neon_la_CXXFLAGS = -mfpu=neon -I.. $(CFLAG_VISIBILITY) +libde265_arm_neon_la_CCASFLAGS = -mfpu=neon -I.. \ + -DHAVE_NEON \ + -DEXTERN_ASM= \ + -DHAVE_AS_FUNC \ + -DHAVE_SECTION_DATA_REL_RO + +if ENABLE_ARM_THUMB + libde265_arm_neon_la_CCASFLAGS += -DCONFIG_THUMB +endif + +libde265_arm_neon_la_SOURCES = \ + asm.S \ + cpudetect.S \ + hevcdsp_qpel_neon.S \ + neon.S + +if HAVE_VISIBILITY + libde265_arm_neon_la_CXXFLAGS += -DHAVE_VISIBILITY +endif + +endif diff --git a/arm/Makefile.in b/arm/Makefile.in new file mode 100644 index 0000000..fb1575b --- /dev/null +++ b/arm/Makefile.in @@ -0,0 +1,770 @@ +# Makefile.in generated by automake 1.16.1 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994-2018 Free Software Foundation, Inc. 
+ +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +VPATH = @srcdir@ +am__is_gnu_make = { \ + if test -z '$(MAKELEVEL)'; then \ + false; \ + elif test -n '$(MAKE_HOST)'; then \ + true; \ + elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ + true; \ + else \ + false; \ + fi; \ +} +am__make_running_with_option = \ + case $${target_option-} in \ + ?) ;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = 
$(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +target_triplet = @target@ + +# NEON specific functions +@ENABLE_NEON_OPT_TRUE@am__append_1 = libde265_arm_neon.la +@ENABLE_NEON_OPT_TRUE@am__append_2 = libde265_arm_neon.la +subdir = libde265/arm +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/ax_compare_version.m4 \ + $(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \ + $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ + $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ + $(top_srcdir)/m4/lt~obsolete.m4 \ + $(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) +libde265_arm_la_DEPENDENCIES = $(am__append_2) +am_libde265_arm_la_OBJECTS = libde265_arm_la-arm.lo +libde265_arm_la_OBJECTS = $(am_libde265_arm_la_OBJECTS) +AM_V_lt = $(am__v_lt_@AM_V@) +am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) +am__v_lt_0 = --silent +am__v_lt_1 = +libde265_arm_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ + $(libde265_arm_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +libde265_arm_neon_la_LIBADD = +am__libde265_arm_neon_la_SOURCES_DIST = asm.S cpudetect.S \ 
+ hevcdsp_qpel_neon.S neon.S +@ENABLE_NEON_OPT_TRUE@am_libde265_arm_neon_la_OBJECTS = \ +@ENABLE_NEON_OPT_TRUE@ libde265_arm_neon_la-asm.lo \ +@ENABLE_NEON_OPT_TRUE@ libde265_arm_neon_la-cpudetect.lo \ +@ENABLE_NEON_OPT_TRUE@ libde265_arm_neon_la-hevcdsp_qpel_neon.lo \ +@ENABLE_NEON_OPT_TRUE@ libde265_arm_neon_la-neon.lo +libde265_arm_neon_la_OBJECTS = $(am_libde265_arm_neon_la_OBJECTS) +@ENABLE_NEON_OPT_TRUE@am_libde265_arm_neon_la_rpath = +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__maybe_remake_depfiles = depfiles +am__depfiles_remade = ./$(DEPDIR)/libde265_arm_la-arm.Plo \ + ./$(DEPDIR)/libde265_arm_neon_la-asm.Plo \ + ./$(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo \ + ./$(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo \ + ./$(DEPDIR)/libde265_arm_neon_la-neon.Plo +am__mv = mv -f +CPPASCOMPILE = $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CCASFLAGS) $(CCASFLAGS) +LTCPPASCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CCASFLAGS) $(CCASFLAGS) +AM_V_CPPAS = $(am__v_CPPAS_@AM_V@) +am__v_CPPAS_ = $(am__v_CPPAS_@AM_DEFAULT_V@) +am__v_CPPAS_0 = @echo " CPPAS " $@; +am__v_CPPAS_1 = +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CXXFLAGS) $(CXXFLAGS) +AM_V_CXX = 
$(am__v_CXX_@AM_V@) +am__v_CXX_ = $(am__v_CXX_@AM_DEFAULT_V@) +am__v_CXX_0 = @echo " CXX " $@; +am__v_CXX_1 = +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ + $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CXXLD = $(am__v_CXXLD_@AM_V@) +am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@) +am__v_CXXLD_0 = @echo " CXXLD " $@; +am__v_CXXLD_1 = +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_@AM_V@) +am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_@AM_V@) +am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +SOURCES = $(libde265_arm_la_SOURCES) $(libde265_arm_neon_la_SOURCES) +DIST_SOURCES = $(libde265_arm_la_SOURCES) \ + $(am__libde265_arm_neon_la_SOURCES_DIST) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. 
This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. +am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +ALLOCA = @ALLOCA@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCAS = @CCAS@ +CCASDEPMODE = @CCASDEPMODE@ +CCASFLAGS = @CCASFLAGS@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +FGREP = @FGREP@ +GREP = @GREP@ +HAVE_CXX11 = @HAVE_CXX11@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LIBDE265_AGE = @LIBDE265_AGE@ +LIBDE265_CURRENT = @LIBDE265_CURRENT@ +LIBDE265_REVISION = @LIBDE265_REVISION@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MKDIR_P = @MKDIR_P@ +NM = @NM@ +NMEDIT = @NMEDIT@ +NUMERIC_VERSION = @NUMERIC_VERSION@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = 
@PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PKG_CONFIG = @PKG_CONFIG@ +PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ +PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ +QTCHOOSER = @QTCHOOSER@ +QTMOC = @QTMOC@ +QT_CFLAGS = @QT_CFLAGS@ +QT_LIBS = @QT_LIBS@ +RANLIB = @RANLIB@ +SDL_CFLAGS = @SDL_CFLAGS@ +SDL_LIBS = @SDL_LIBS@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +SWSCALE_CFLAGS = @SWSCALE_CFLAGS@ +SWSCALE_LIBS = @SWSCALE_LIBS@ +VERSION = @VERSION@ +VIDEOGFX_CFLAGS = @VIDEOGFX_CFLAGS@ +VIDEOGFX_LIBS = @VIDEOGFX_LIBS@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target = @target@ +target_alias = @target_alias@ +target_cpu = 
@target_cpu@ +target_os = @target_os@ +target_vendor = @target_vendor@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +noinst_LTLIBRARIES = libde265_arm.la $(am__append_1) +libde265_arm_la_CXXFLAGS = -I.. $(CFLAG_VISIBILITY) +libde265_arm_la_SOURCES = arm.cc arm.h +libde265_arm_la_LIBADD = $(am__append_2) +@ENABLE_NEON_OPT_TRUE@libde265_arm_neon_la_CXXFLAGS = -mfpu=neon -I.. $(CFLAG_VISIBILITY) +@ENABLE_NEON_OPT_TRUE@libde265_arm_neon_la_CCASFLAGS = -mfpu=neon -I.. \ +@ENABLE_NEON_OPT_TRUE@ -DHAVE_NEON \ +@ENABLE_NEON_OPT_TRUE@ -DEXTERN_ASM= \ +@ENABLE_NEON_OPT_TRUE@ -DHAVE_AS_FUNC \ +@ENABLE_NEON_OPT_TRUE@ -DHAVE_SECTION_DATA_REL_RO + +@ENABLE_NEON_OPT_TRUE@libde265_arm_neon_la_SOURCES = \ +@ENABLE_NEON_OPT_TRUE@ asm.S \ +@ENABLE_NEON_OPT_TRUE@ cpudetect.S \ +@ENABLE_NEON_OPT_TRUE@ hevcdsp_qpel_neon.S \ +@ENABLE_NEON_OPT_TRUE@ neon.S + +all: all-am + +.SUFFIXES: +.SUFFIXES: .S .cc .lo .o .obj +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu libde265/arm/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu libde265/arm/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm -f $${locs}; \ + rm -f $${locs}; \ + } + +libde265_arm.la: $(libde265_arm_la_OBJECTS) $(libde265_arm_la_DEPENDENCIES) $(EXTRA_libde265_arm_la_DEPENDENCIES) + $(AM_V_CXXLD)$(libde265_arm_la_LINK) $(libde265_arm_la_OBJECTS) $(libde265_arm_la_LIBADD) $(LIBS) + +libde265_arm_neon.la: $(libde265_arm_neon_la_OBJECTS) $(libde265_arm_neon_la_DEPENDENCIES) $(EXTRA_libde265_arm_neon_la_DEPENDENCIES) + $(AM_V_CCLD)$(LINK) $(am_libde265_arm_neon_la_rpath) $(libde265_arm_neon_la_OBJECTS) $(libde265_arm_neon_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_arm_la-arm.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_arm_neon_la-asm.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_arm_neon_la-neon.Plo@am__quote@ # am--include-marker + +$(am__depfiles_remade): + @$(MKDIR_P) $(@D) + @echo '# dummy' >$@-t && $(am__mv) $@-t $@ + +am--depfiles: $(am__depfiles_remade) + +.S.o: +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(CPPASCOMPILE) -c -o $@ $< + +.S.obj: +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(CPPASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.S.lo: +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(LTCPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(LTCPPASCOMPILE) -c -o $@ $< + +libde265_arm_neon_la-asm.lo: asm.S +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) 
$(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-asm.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-asm.Tpo -c -o libde265_arm_neon_la-asm.lo `test -f 'asm.S' || echo '$(srcdir)/'`asm.S +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-asm.Tpo $(DEPDIR)/libde265_arm_neon_la-asm.Plo +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='asm.S' object='libde265_arm_neon_la-asm.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -c -o libde265_arm_neon_la-asm.lo `test -f 'asm.S' || echo '$(srcdir)/'`asm.S + +libde265_arm_neon_la-cpudetect.lo: cpudetect.S +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-cpudetect.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-cpudetect.Tpo -c -o libde265_arm_neon_la-cpudetect.lo `test -f 'cpudetect.S' || echo '$(srcdir)/'`cpudetect.S +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-cpudetect.Tpo $(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='cpudetect.S' object='libde265_arm_neon_la-cpudetect.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) 
$(CCASFLAGS) -c -o libde265_arm_neon_la-cpudetect.lo `test -f 'cpudetect.S' || echo '$(srcdir)/'`cpudetect.S + +libde265_arm_neon_la-hevcdsp_qpel_neon.lo: hevcdsp_qpel_neon.S +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-hevcdsp_qpel_neon.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Tpo -c -o libde265_arm_neon_la-hevcdsp_qpel_neon.lo `test -f 'hevcdsp_qpel_neon.S' || echo '$(srcdir)/'`hevcdsp_qpel_neon.S +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Tpo $(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='hevcdsp_qpel_neon.S' object='libde265_arm_neon_la-hevcdsp_qpel_neon.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -c -o libde265_arm_neon_la-hevcdsp_qpel_neon.lo `test -f 'hevcdsp_qpel_neon.S' || echo '$(srcdir)/'`hevcdsp_qpel_neon.S + +libde265_arm_neon_la-neon.lo: neon.S +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-neon.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-neon.Tpo -c -o libde265_arm_neon_la-neon.lo `test -f 'neon.S' || echo '$(srcdir)/'`neon.S +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-neon.Tpo $(DEPDIR)/libde265_arm_neon_la-neon.Plo 
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='neon.S' object='libde265_arm_neon_la-neon.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -c -o libde265_arm_neon_la-neon.lo `test -f 'neon.S' || echo '$(srcdir)/'`neon.S + +.cc.o: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $< + +.cc.obj: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.cc.lo: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $< + +libde265_arm_la-arm.lo: arm.cc 
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_arm_la-arm.lo -MD -MP -MF $(DEPDIR)/libde265_arm_la-arm.Tpo -c -o libde265_arm_la-arm.lo `test -f 'arm.cc' || echo '$(srcdir)/'`arm.cc +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_la-arm.Tpo $(DEPDIR)/libde265_arm_la-arm.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='arm.cc' object='libde265_arm_la-arm.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_arm_la-arm.lo `test -f 'arm.cc' || echo '$(srcdir)/'`arm.cc + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" 
+cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(BUILT_SOURCES) + $(MAKE) $(AM_MAKEFLAGS) distdir-am + +distdir-am: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -f ./$(DEPDIR)/libde265_arm_la-arm.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-asm.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-neon.Plo + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f ./$(DEPDIR)/libde265_arm_la-arm.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-asm.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-neon.Plo + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \ + clean-generic clean-libtool clean-noinstLTLIBRARIES \ + cscopelist-am ctags ctags-am distclean distclean-compile \ + distclean-generic distclean-libtool distclean-tags distdir dvi \ + dvi-am html html-am info info-am install install-am \ + install-data install-data-am install-dvi install-dvi-am \ + install-exec install-exec-am install-html install-html-am \ + install-info install-info-am install-man install-pdf \ + 
install-pdf-am install-ps install-ps-am install-strip \ + installcheck installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags tags-am uninstall uninstall-am + +.PRECIOUS: Makefile + + +@HAVE_VISIBILITY_TRUE@ libde265_arm_la_CXXFLAGS += -DHAVE_VISIBILITY + +@ENABLE_ARM_THUMB_TRUE@@ENABLE_NEON_OPT_TRUE@ libde265_arm_neon_la_CCASFLAGS += -DCONFIG_THUMB + +@ENABLE_NEON_OPT_TRUE@@HAVE_VISIBILITY_TRUE@ libde265_arm_neon_la_CXXFLAGS += -DHAVE_VISIBILITY + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/arm/arm.cc b/arm/arm.cc new file mode 100644 index 0000000..9791f15 --- /dev/null +++ b/arm/arm.cc @@ -0,0 +1,123 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2015 struktur AG, Joachim Bauch + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "arm.h"
+
+#ifdef HAVE_NEON
+// Wrap each FFmpeg NEON kernel: ff_* takes (height, width) and has no mcbuffer argument.
+#define QPEL_FUNC(name) \
+    extern "C" void ff_##name(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, \
+                                int height, int width); \
+    void libde265_##name(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, \
+                           int width, int height, int16_t* mcbuffer) { \
+        ff_##name(dst, dststride, src, srcstride, height, width); \
+    }
+
+QPEL_FUNC(hevc_put_qpel_v1_neon_8);
+QPEL_FUNC(hevc_put_qpel_v2_neon_8);
+QPEL_FUNC(hevc_put_qpel_v3_neon_8);
+QPEL_FUNC(hevc_put_qpel_h1_neon_8);
+QPEL_FUNC(hevc_put_qpel_h2_neon_8);
+QPEL_FUNC(hevc_put_qpel_h3_neon_8);
+QPEL_FUNC(hevc_put_qpel_h1v1_neon_8);
+QPEL_FUNC(hevc_put_qpel_h1v2_neon_8);
+QPEL_FUNC(hevc_put_qpel_h1v3_neon_8);
+QPEL_FUNC(hevc_put_qpel_h2v1_neon_8);
+QPEL_FUNC(hevc_put_qpel_h2v2_neon_8);
+QPEL_FUNC(hevc_put_qpel_h2v3_neon_8);
+QPEL_FUNC(hevc_put_qpel_h3v1_neon_8);
+QPEL_FUNC(hevc_put_qpel_h3v2_neon_8);
+QPEL_FUNC(hevc_put_qpel_h3v3_neon_8);
+#undef QPEL_FUNC
+
+#if defined(HAVE_SIGNAL_H) && defined(HAVE_SETJMP_H)
+
+#include <signal.h>
+#include <setjmp.h>
+
+extern "C" void libde265_detect_neon(void);
+// Jump target used to leave the SIGILL handler during the NEON probe.
+static sigjmp_buf jump_env;
+// SIGILL handler for the probe: jumps back into has_NEON() with a non-zero value.
+static void sighandler(int sig) {
+  (void)sig;
+  siglongjmp(jump_env, 1);  // siglongjmp (not longjmp) so the saved signal mask is restored
+}
+// Returns true if libde265_detect_neon() runs without raising SIGILL; the result is cached.
+static bool has_NEON() {
+  static bool checked_NEON = false;
+  static bool have_NEON = false;
+
+  if (!checked_NEON) {
+    void (*oldsignal)(int);
+
+    checked_NEON = true;
+    oldsignal = signal(SIGILL, sighandler);
+    if (sigsetjmp(jump_env, 1)) {  // savemask=1: SIGILL is not left blocked after the handler jumps out
+      signal(SIGILL, oldsignal);
+      have_NEON = false;
+      return false;
+    }
+    libde265_detect_neon();
+    signal(SIGILL, oldsignal);
+    have_NEON = true;
+  }
+
+  return have_NEON;
+}
+
+#else // #if defined(HAVE_SIGNAL_H) && defined(HAVE_SETJMP_H)
+
+#warning "Don't know how to detect NEON support at runtime- will be disabled"
+
+static bool has_NEON() {
+  return false;
+}
+
+#endif
+
+#endif // #ifdef HAVE_NEON
+// Overwrites accel->put_hevc_qpel_8[x][y] (all but [0][0]) with NEON versions when has_NEON() is true.
+void init_acceleration_functions_arm(struct acceleration_functions* accel)
+{
+#ifdef HAVE_NEON
+  if (has_NEON()) {
+    accel->put_hevc_qpel_8[0][1] = libde265_hevc_put_qpel_v1_neon_8;
+    accel->put_hevc_qpel_8[0][2] = libde265_hevc_put_qpel_v2_neon_8;
+    accel->put_hevc_qpel_8[0][3] = libde265_hevc_put_qpel_v3_neon_8;
+    accel->put_hevc_qpel_8[1][0] = libde265_hevc_put_qpel_h1_neon_8;
+    accel->put_hevc_qpel_8[1][1] = libde265_hevc_put_qpel_h1v1_neon_8;
+    accel->put_hevc_qpel_8[1][2] = libde265_hevc_put_qpel_h1v2_neon_8;
+    accel->put_hevc_qpel_8[1][3] = libde265_hevc_put_qpel_h1v3_neon_8;
+    accel->put_hevc_qpel_8[2][0] = libde265_hevc_put_qpel_h2_neon_8;
+    accel->put_hevc_qpel_8[2][1] = libde265_hevc_put_qpel_h2v1_neon_8;
+    accel->put_hevc_qpel_8[2][2] = libde265_hevc_put_qpel_h2v2_neon_8;
+    accel->put_hevc_qpel_8[2][3] = libde265_hevc_put_qpel_h2v3_neon_8;
+    accel->put_hevc_qpel_8[3][0] = libde265_hevc_put_qpel_h3_neon_8;
+    accel->put_hevc_qpel_8[3][1] = libde265_hevc_put_qpel_h3v1_neon_8;
+    accel->put_hevc_qpel_8[3][2] = libde265_hevc_put_qpel_h3v2_neon_8;
+    accel->put_hevc_qpel_8[3][3] = libde265_hevc_put_qpel_h3v3_neon_8;
+  }
+#endif // #ifdef HAVE_NEON
+}
diff --git a/arm/arm.h b/arm/arm.h
new file mode 100644
index 0000000..d64172a
--- /dev/null
+++ b/arm/arm.h
@@ -0,0 +1,28 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2015 struktur AG, Joachim Bauch
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBDE265_ARM_H
+#define LIBDE265_ARM_H
+
+#include "acceleration.h"
+// Installs the NEON qpel routines into accel when built with HAVE_NEON and the CPU passes the runtime probe.
+void init_acceleration_functions_arm(struct acceleration_functions* accel);
+
+#endif // LIBDE265_ARM_H
diff --git a/arm/asm.S b/arm/asm.S
new file mode 100644
index 0000000..1d0e5a9
--- /dev/null
+++ b/arm/asm.S
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+/* Line-prefix macros: each expands to nothing when enabled, or to "@" (comment start) when disabled. */
+#ifdef __ELF__
+# define ELF
+#else
+# define ELF @
+#endif
+/* A-prefixed lines assemble only when CONFIG_THUMB is 0, T-prefixed lines only when it is set. */
+#if CONFIG_THUMB
+# define A @
+# define T
+#else
+# define A
+# define T @
+#endif
+
+#if HAVE_AS_FUNC
+# define FUNC
+#else
+# define FUNC @
+#endif
+
+#if HAVE_NEON
+        .arch armv7-a
+#elif HAVE_ARMV6T2
+        .arch armv6t2
+#elif HAVE_ARMV6
+        .arch armv6
+#elif HAVE_ARMV5TE
+        .arch armv5te
+#endif
+
+#if HAVE_NEON
+        .fpu neon
+#elif HAVE_VFP
+        .fpu vfp
+#endif
+
+        .syntax unified
+T       .thumb
+ELF     .eabi_attribute 25, 1 @ Tag_ABI_align_preserved
+ELF     .section .note.GNU-stack,"",%progbits @ Mark stack as non-executable
+/* function NAME [, export] [, align]: opens a function in .text and defines the matching endfunc. */
+.macro function name, export=0, align=2
+        .set .Lpic_idx, 0
+        .set .Lpic_gp, 0
+    .macro endfunc
+      .if .Lpic_idx
+        .align 2
+        .altmacro
+        put_pic %(.Lpic_idx - 1)
+        .noaltmacro
+      .endif
+ELF     .size \name, . - \name
+FUNC    .endfunc
+        .purgem endfunc
+    .endm
+        .text
+        .align \align
+    .if \export
+        .global EXTERN_ASM\name
+ELF     .type EXTERN_ASM\name, %function
+FUNC    .func EXTERN_ASM\name
+EXTERN_ASM\name:
+    .else
+ELF     .type \name, %function
+FUNC    .func \name
+\name:
+    .endif
+.endm
+/* const NAME: labelled object in .rodata (or .data.rel.ro when relocate is set and supported); endconst closes it. */
+.macro const name, align=2, relocate=0
+    .macro endconst
+ELF     .size \name, . - \name
+        .purgem endconst
+    .endm
+.if HAVE_SECTION_DATA_REL_RO && \relocate
+        .section .data.rel.ro
+.else
+        .section .rodata
+.endif
+        .align \align
+\name:
+.endm
+
+#if !HAVE_ARMV6T2_EXTERNAL
+.macro movw rd, val
+        mov \rd, \val & 255
+        orr \rd, \val & ~255
+.endm
+#endif
+/* mov32: loads a 32-bit constant with movw/movt when HAVE_ARMV6T2_EXTERNAL, else with a literal-pool ldr. */
+.macro mov32 rd, val
+#if HAVE_ARMV6T2_EXTERNAL
+        movw \rd, #(\val) & 0xffff
+    .if (\val) >> 16
+        movt \rd, #(\val) >> 16
+    .endif
+#else
+        ldr \rd, =\val
+#endif
+.endm
+
+.macro put_pic num
+        put_pic_\num
+.endm
+
+.macro do_def_pic num, val, label
+    .macro put_pic_\num
+      .if \num
+        .altmacro
+        put_pic %(\num - 1)
+        .noaltmacro
+      .endif
+\label: .word \val
+        .purgem put_pic_\num
+    .endm
+.endm
+
+.macro def_pic val, label
+        .altmacro
+        do_def_pic %.Lpic_idx, \val, \label
+        .noaltmacro
+        .set .Lpic_idx, .Lpic_idx + 1
+.endm
+/* ldpic: pc-relative load of val; the offset word is queued with def_pic and emitted later by endfunc. */
+.macro ldpic rd, val, indir=0
+        ldr \rd, .Lpicoff\@
+.Lpic\@:
+    .if \indir
+A       ldr \rd, [pc, \rd]
+T       add \rd, pc
+T       ldr \rd, [\rd]
+    .else
+        add \rd, pc
+    .endif
+        def_pic \val - (.Lpic\@ + (8 >> CONFIG_THUMB)), .Lpicoff\@
+.endm
+
+.macro movrel rd, val
+#if CONFIG_PIC
+        ldpic \rd, \val
+#elif HAVE_ARMV6T2_EXTERNAL && !defined(__APPLE__)
+        movw \rd, #:lower16:\val
+        movt \rd, #:upper16:\val
+#else
+        ldr \rd, =\val
+#endif
+.endm
+
+.macro movrelx rd, val, gp
+#if CONFIG_PIC && defined(__ELF__)
+    .ifnb \gp
+      .if .Lpic_gp
+        .unreq gp
+      .endif
+        gp .req \gp
+        ldpic gp, _GLOBAL_OFFSET_TABLE_
+    .elseif !.Lpic_gp
+        gp .req r12
+        ldpic gp, _GLOBAL_OFFSET_TABLE_
+    .endif
+        .set .Lpic_gp, 1
+        ldr \rd, .Lpicoff\@
+        ldr \rd, [gp, \rd]
+        def_pic \val(GOT), .Lpicoff\@
+#elif CONFIG_PIC && defined(__APPLE__)
+        ldpic \rd, .Lpic\@, indir=1
+        .non_lazy_symbol_pointer
+.Lpic\@:
+        .indirect_symbol \val
+        .word 0
+        .text
+#else
+        movrel \rd, \val
+#endif
+.endm
+/* Addressing helpers: one instruction on A lines, a split sequence on T lines (the T form of add_sh also overwrites rm). */
+.macro add_sh rd, rn, rm, sh:vararg
+A       add \rd, \rn, \rm, \sh
+T       mov \rm, \rm, \sh
+T       add \rd, \rn, \rm
+.endm
+
+.macro ldr_pre rt, rn, rm:vararg
+A       ldr \rt, [\rn, \rm]!
+T       add \rn, \rn, \rm
+T       ldr \rt, [\rn]
+.endm
+
+.macro ldr_dpre rt, rn, rm:vararg
+A       ldr \rt, [\rn, -\rm]!
+T       sub \rn, \rn, \rm
+T       ldr \rt, [\rn]
+.endm
+
+.macro ldr_nreg rt, rn, rm:vararg
+A       ldr \rt, [\rn, -\rm]
+T       sub \rt, \rn, \rm
+T       ldr \rt, [\rt]
+.endm
+
+.macro ldr_post rt, rn, rm:vararg
+A       ldr \rt, [\rn], \rm
+T       ldr \rt, [\rn]
+T       add \rn, \rn, \rm
+.endm
+
+.macro ldrc_pre cc, rt, rn, rm:vararg
+A       ldr\cc \rt, [\rn, \rm]!
+T       itt \cc
+T       add\cc \rn, \rn, \rm
+T       ldr\cc \rt, [\rn]
+.endm
+
+.macro ldrd_reg rt, rt2, rn, rm
+A       ldrd \rt, \rt2, [\rn, \rm]
+T       add \rt, \rn, \rm
+T       ldrd \rt, \rt2, [\rt]
+.endm
+
+.macro ldrd_post rt, rt2, rn, rm
+A       ldrd \rt, \rt2, [\rn], \rm
+T       ldrd \rt, \rt2, [\rn]
+T       add \rn, \rn, \rm
+.endm
+
+.macro ldrh_pre rt, rn, rm
+A       ldrh \rt, [\rn, \rm]!
+T       add \rn, \rn, \rm
+T       ldrh \rt, [\rn]
+.endm
+
+.macro ldrh_dpre rt, rn, rm
+A       ldrh \rt, [\rn, -\rm]!
+T       sub \rn, \rn, \rm
+T       ldrh \rt, [\rn]
+.endm
+
+.macro ldrh_post rt, rn, rm
+A       ldrh \rt, [\rn], \rm
+T       ldrh \rt, [\rn]
+T       add \rn, \rn, \rm
+.endm
+
+.macro ldrb_post rt, rn, rm
+A       ldrb \rt, [\rn], \rm
+T       ldrb \rt, [\rn]
+T       add \rn, \rn, \rm
+.endm
+
+.macro str_post rt, rn, rm:vararg
+A       str \rt, [\rn], \rm
+T       str \rt, [\rn]
+T       add \rn, \rn, \rm
+.endm
+
+.macro strb_post rt, rn, rm:vararg
+A       strb \rt, [\rn], \rm
+T       strb \rt, [\rn]
+T       add \rn, \rn, \rm
+.endm
+
+.macro strd_post rt, rt2, rn, rm
+A       strd \rt, \rt2, [\rn], \rm
+T       strd \rt, \rt2, [\rn]
+T       add \rn, \rn, \rm
+.endm
+
+.macro strh_pre rt, rn, rm
+A       strh \rt, [\rn, \rm]!
+T       add \rn, \rn, \rm
+T       strh \rt, [\rn]
+.endm
+
+.macro strh_dpre rt, rn, rm
+A       strh \rt, [\rn, -\rm]!
+T       sub \rn, \rn, \rm
+T       strh \rt, [\rn]
+.endm
+
+.macro strh_post rt, rn, rm
+A       strh \rt, [\rn], \rm
+T       strh \rt, [\rn]
+T       add \rn, \rn, \rm
+.endm
+
+.macro strh_dpost rt, rn, rm
+A       strh \rt, [\rn], -\rm
+T       strh \rt, [\rn]
+T       sub \rn, \rn, \rm
+.endm
+/* VFP / NOVFP line prefixes: VFP lines are kept when HAVE_VFP_ARGS is set, NOVFP lines otherwise. */
+#if HAVE_VFP_ARGS
+ELF     .eabi_attribute 28, 1
+# define VFP
+# define NOVFP @
+#else
+# define VFP @
+# define NOVFP
+#endif
+/* X(s) pastes EXTERN_ASM in front of s after macro expansion of both. */
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b)
+#define X(s) JOIN(EXTERN_ASM, s)
diff --git a/arm/cpudetect.S b/arm/cpudetect.S
new file mode 100644
index 0000000..45600a8
--- /dev/null
+++ b/arm/cpudetect.S
@@ -0,0 +1,29 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2015 struktur AG, Joachim Bauch
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm.S"
+#include "neon.S"
+
+// We execute a simple NEON instruction and check whether SIGILL is triggered
+// to detect if the CPU supports NEON code (the caller installs the SIGILL handler).
+function libde265_detect_neon, export=1
+        vand q0, q0, q0
+        bx lr
+endfunc
diff --git a/arm/hevcdsp_qpel_neon.S b/arm/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000..4e438a9
--- /dev/null
+++ b/arm/hevcdsp_qpel_neon.S
@@ -0,0 +1,1004 @@
+/*
+ * Copyright (c) 2014 - 2015 Seppo Tomperi
+ *
+ * This file is part of FFmpeg.
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * This is commit 63ca0fe8288dbd300c9bb814cb671e5d889f691c from + * https://github.com/FFmpeg/FFmpeg/blob/master/libavcodec/arm/hevcdsp_qpel_neon.S + */ + +#include "asm.S" +#include "neon.S" + +#define MAX_PB_SIZE #64 + +.macro regshuffle_d8 + vmov d16, d17 + vmov d17, d18 + vmov d18, d19 + vmov d19, d20 + vmov d20, d21 + vmov d21, d22 + vmov d22, d23 +.endm + +.macro regshuffle_q8 + vmov q0, q1 + vmov q1, q2 + vmov q2, q3 + vmov q3, q4 + vmov q4, q5 + vmov q5, q6 + vmov q6, q7 +.endm + +.macro vextin8 + pld [r2] + vld1.8 {q11}, [r2], r3 + vext.8 d16, d22, d23, #1 + vext.8 d17, d22, d23, #2 + vext.8 d18, d22, d23, #3 + vext.8 d19, d22, d23, #4 + vext.8 d20, d22, d23, #5 + vext.8 d21, d22, d23, #6 + vext.8 d22, d22, d23, #7 +.endm + +.macro loadin8 + pld [r2] + vld1.8 {d16}, [r2], r3 + pld [r2] + vld1.8 {d17}, [r2], r3 + pld [r2] + vld1.8 {d18}, [r2], r3 + pld [r2] + vld1.8 {d19}, [r2], r3 + pld [r2] + vld1.8 {d20}, [r2], r3 + pld [r2] + vld1.8 {d21}, [r2], r3 + pld [r2] + vld1.8 {d22}, [r2], r3 + pld [r2] + vld1.8 {d23}, [r2], r3 +.endm + +.macro qpel_filter_1_32b + vmov.i16 d16, #58 + vmov.i16 d17, #10 + vmull.s16 q9, d6, d16 // 58 * d0 + vmull.s16 q10, d7, d16 // 58 * d1 + vmov.i16 d16, #17 + vmull.s16 q11, d4, d17 // 10 * c0 + 
vmull.s16 q12, d5, d17 // 10 * c1 + vmov.i16 d17, #5 + vmull.s16 q13, d8, d16 // 17 * e0 + vmull.s16 q14, d9, d16 // 17 * e1 + vmull.s16 q15, d10, d17 // 5 * f0 + vmull.s16 q8, d11, d17 // 5 * f1 + vsub.s32 q9, q11 // 58 * d0 - 10 * c0 + vsub.s32 q10, q12 // 58 * d1 - 10 * c1 + vshll.s16 q11, d2, #2 // 4 * b0 + vshll.s16 q12, d3, #2 // 4 * b1 + vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + vsubl.s16 q13, d12, d0 // g0 - a0 + vsubl.s16 q14, d13, d1 // g1 - a1 + vadd.s32 q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + vsub.s32 q13, q15 // g0 - a0 - 5 * f0 + vsub.s32 q14, q8 // g1 - a1 - 5 * f1 + vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0 + vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1 + vqshrn.s32 d16, q9, #6 + vqshrn.s32 d17, q10, #6 +.endm + +// input q0 - q7 +// output q8 +.macro qpel_filter_2_32b + vmov.i32 q8, #11 + vaddl.s16 q9, d6, d8 // d0 + e0 + vaddl.s16 q10, d7, d9 // d1 + e1 + vaddl.s16 q11, d4, d10 // c0 + f0 + vaddl.s16 q12, d5, d11 // c1 + f1 + vmul.s32 q11, q8 // 11 * (c0 + f0) + vmul.s32 q12, q8 // 11 * (c1 + f1) + vmov.i32 q8, #40 + vaddl.s16 q15, d2, d12 // b0 + g0 + vmul.s32 q9, q8 // 40 * (d0 + e0) + vmul.s32 q10, q8 // 40 * (d1 + e1) + vaddl.s16 q8, d3, d13 // b1 + g1 + vaddl.s16 q13, d0, d14 // a0 + h0 + vaddl.s16 q14, d1, d15 // a1 + h1 + vshl.s32 q15, #2 // 4*(b0+g0) + vshl.s32 q8, #2 // 4*(b1+g1) + vadd.s32 q11, q13 // 11 * (c0 + f0) + a0 + h0 + vadd.s32 q12, q14 // 11 * (c1 + f1) + a1 + h1 + vadd.s32 q9, q15 // 40 * (d0 + e0) + 4*(b0+g0) + vadd.s32 q10, q8 // 40 * (d1 + e1) + 4*(b1+g1) + vsub.s32 q9, q11 // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0) + vsub.s32 q10, q12 // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1) + vqshrn.s32 d16, q9, #6 + vqshrn.s32 d17, q10, #6 +.endm + +.macro qpel_filter_3_32b + vmov.i16 d16, #58 + vmov.i16 d17, #10 
+ vmull.s16 q9, d8, d16 // 58 * d0 + vmull.s16 q10, d9, d16 // 58 * d1 + vmov.i16 d16, #17 + vmull.s16 q11, d10, d17 // 10 * c0 + vmull.s16 q12, d11, d17 // 10 * c1 + vmov.i16 d17, #5 + vmull.s16 q13, d6, d16 // 17 * e0 + vmull.s16 q14, d7, d16 // 17 * e1 + vmull.s16 q15, d4, d17 // 5 * f0 + vmull.s16 q8, d5, d17 // 5 * f1 + vsub.s32 q9, q11 // 58 * d0 - 10 * c0 + vsub.s32 q10, q12 // 58 * d1 - 10 * c1 + vshll.s16 q11, d12, #2 // 4 * b0 + vshll.s16 q12, d13, #2 // 4 * b1 + vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + vsubl.s16 q13, d2, d14 // g0 - a0 + vsubl.s16 q14, d3, d15 // g1 - a1 + vadd.s32 q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + vsub.s32 q13, q15 // g0 - a0 - 5 * f0 + vsub.s32 q14, q8 // g1 - a1 - 5 * f1 + vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0 + vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1 + vqshrn.s32 d16, q9, #6 + vqshrn.s32 d17, q10, #6 +.endm + +.macro qpel_filter_1 out=q7 + vmov.u8 d24, #58 + vmov.u8 d25, #10 + vshll.u8 q13, d20, #4 // 16*e + vshll.u8 q14, d21, #2 // 4*f + vmull.u8 \out, d19, d24 // 58*d + vaddw.u8 q13, q13, d20 // 17*e + vmull.u8 q15, d18, d25 // 10*c + vaddw.u8 q14, q14, d21 // 5*f + vsubl.u8 q12, d22, d16 // g - a + vadd.u16 \out, q13 // 58d + 17e + vshll.u8 q13, d17, #2 // 4*b + vadd.u16 q15, q14 // 10*c + 5*f + vadd.s16 q13, q12 // - a + 4*b + g + vsub.s16 \out, q15 // -10*c + 58*d + 17*e -5*f + vadd.s16 \out, q13 // -a + 4*b -10*c + 58*d + 17*e -5*f +.endm + +.macro qpel_filter_2 out=q7 + vmov.i16 q12, #10 + vmov.i16 q14, #11 + vaddl.u8 q13, d19, d20 // d + e + vaddl.u8 q15, d18, d21 // c + f + vmul.u16 q13, q12 // 10 * (d+e) + vmul.u16 q15, q14 // 11 * ( c + f) + vaddl.u8 \out, d17, d22 // b + g + vaddl.u8 q12, d16, d23 // a + h + vadd.u16 \out, q13 // b + 10 * (d + e) + g + vadd.s16 q12, q15 + vshl.u16 \out, #2 // 4 * (b + 10 * (d + e) 
+ g) + vsub.s16 \out, q12 +.endm + +.macro qpel_filter_3 out=q7 + vmov.u8 d24, #58 + vmov.u8 d25, #10 + vshll.u8 q13, d19, #4 // 16*e + vshll.u8 q14, d18, #2 // 4*f + vmull.u8 \out, d20, d24 // 58*d + vaddw.u8 q13, q13, d19 // 17*e + vmull.u8 q15, d21, d25 // 10*c + vaddw.u8 q14, q14, d18 // 5*f + vsubl.u8 q12, d17, d23 // g - a + vadd.u16 \out, q13 // 58d + 17e + vshll.u8 q13, d22, #2 // 4*b + vadd.u16 q15, q14 // 10*c + 5*f + vadd.s16 q13, q12 // - a + 4*b + g + vsub.s16 \out, q15 // -10*c + 58*d + 17*e -5*f + vadd.s16 \out, q13 // -a + 4*b -10*c + 58*d + 17*e -5*f +.endm + +.macro hevc_put_qpel_vX_neon_8 filter + push {r4, r5, r6, r7} + ldr r4, [sp, #16] // height + ldr r5, [sp, #20] // width + vpush {d8-d15} + sub r2, r2, r3, lsl #1 + sub r2, r3 + mov r12, r4 + mov r6, r0 + mov r7, r2 + lsl r1, #1 +0: loadin8 + cmp r5, #4 + beq 4f +8: subs r4, #1 + \filter + vst1.16 {q7}, [r0], r1 + regshuffle_d8 + vld1.8 {d23}, [r2], r3 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #16 + mov r0, r6 + add r7, #8 + mov r2, r7 + b 0b +4: subs r4, #1 + \filter + vst1.16 d14, [r0], r1 + regshuffle_d8 + vld1.32 {d23[0]}, [r2], r3 + bne 4b +99: vpop {d8-d15} + pop {r4, r5, r6, r7} + bx lr +.endm + +.macro hevc_put_qpel_uw_vX_neon_8 filter + push {r4-r10} + ldr r5, [sp, #28] // width + ldr r4, [sp, #32] // height + ldr r8, [sp, #36] // src2 + ldr r9, [sp, #40] // src2stride + vpush {d8-d15} + sub r2, r2, r3, lsl #1 + sub r2, r3 + mov r12, r4 + mov r6, r0 + mov r7, r2 + cmp r8, #0 + bne .Lbi\@ +0: loadin8 + cmp r5, #4 + beq 4f +8: subs r4, #1 + \filter + vqrshrun.s16 d0, q7, #6 + vst1.8 d0, [r0], r1 + regshuffle_d8 + vld1.8 {d23}, [r2], r3 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #8 + mov r0, r6 + add r7, #8 + mov r2, r7 + b 0b +4: subs r4, #1 + \filter + vqrshrun.s16 d0, q7, #6 + vst1.32 d0[0], [r0], r1 + regshuffle_d8 + vld1.32 {d23[0]}, [r2], r3 + bne 4b + b 99f +.Lbi\@: lsl r9, #1 + mov r10, r8 +0: loadin8 + cmp r5, #4 + beq 4f +8: subs r4, #1 + \filter + 
vld1.16 {q0}, [r8], r9 + vqadd.s16 q0, q7 + vqrshrun.s16 d0, q0, #7 + vst1.8 d0, [r0], r1 + regshuffle_d8 + vld1.8 {d23}, [r2], r3 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #8 + mov r0, r6 + add r10, #16 + mov r8, r10 + add r7, #8 + mov r2, r7 + b 0b +4: subs r4, #1 + \filter + vld1.16 d0, [r8], r9 + vqadd.s16 d0, d14 + vqrshrun.s16 d0, q0, #7 + vst1.32 d0[0], [r0], r1 + regshuffle_d8 + vld1.32 {d23[0]}, [r2], r3 + bne 4b +99: vpop {d8-d15} + pop {r4-r10} + bx lr +.endm + +function ff_hevc_put_qpel_v1_neon_8, export=1 + hevc_put_qpel_vX_neon_8 qpel_filter_1 +endfunc + +function ff_hevc_put_qpel_v2_neon_8, export=1 + hevc_put_qpel_vX_neon_8 qpel_filter_2 +endfunc + +function ff_hevc_put_qpel_v3_neon_8, export=1 + hevc_put_qpel_vX_neon_8 qpel_filter_3 +endfunc + + +function ff_hevc_put_qpel_uw_v1_neon_8, export=1 + hevc_put_qpel_uw_vX_neon_8 qpel_filter_1 +endfunc + +function ff_hevc_put_qpel_uw_v2_neon_8, export=1 + hevc_put_qpel_uw_vX_neon_8 qpel_filter_2 +endfunc + +function ff_hevc_put_qpel_uw_v3_neon_8, export=1 + hevc_put_qpel_uw_vX_neon_8 qpel_filter_3 +endfunc + +.macro hevc_put_qpel_hX_neon_8 filter + push {r4, r5, r6, r7} + ldr r4, [sp, #16] // height + ldr r5, [sp, #20] // width + + vpush {d8-d15} + sub r2, #4 + lsl r1, #1 + mov r12, r4 + mov r6, r0 + mov r7, r2 + cmp r5, #4 + beq 4f +8: subs r4, #1 + vextin8 + \filter + vst1.16 {q7}, [r0], r1 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #16 + mov r0, r6 + add r7, #8 + mov r2, r7 + cmp r5, #4 + bne 8b +4: subs r4, #1 + vextin8 + \filter + vst1.16 d14, [r0], r1 + bne 4b +99: vpop {d8-d15} + pop {r4, r5, r6, r7} + bx lr +.endm + +.macro hevc_put_qpel_uw_hX_neon_8 filter + push {r4-r10} + ldr r5, [sp, #28] // width + ldr r4, [sp, #32] // height + ldr r8, [sp, #36] // src2 + ldr r9, [sp, #40] // src2stride + vpush {d8-d15} + sub r2, #4 + mov r12, r4 + mov r6, r0 + mov r7, r2 + cmp r8, #0 + bne .Lbi\@ + cmp r5, #4 + beq 4f +8: subs r4, #1 + vextin8 + \filter + vqrshrun.s16 d0, q7, #6 
+ vst1.8 d0, [r0], r1 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #8 + mov r0, r6 + add r7, #8 + mov r2, r7 + cmp r5, #4 + bne 8b +4: subs r4, #1 + vextin8 + \filter + vqrshrun.s16 d0, q7, #6 + vst1.32 d0[0], [r0], r1 + bne 4b + b 99f +.Lbi\@: + lsl r9, #1 + cmp r5, #4 + beq 4f + mov r10, r8 +8: subs r4, #1 + vextin8 + \filter + vld1.16 {q0}, [r8], r9 + vqadd.s16 q0, q7 + vqrshrun.s16 d0, q0, #7 + vst1.8 d0, [r0], r1 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #8 + add r10, #16 + mov r8, r10 + mov r0, r6 + add r7, #8 + mov r2, r7 + cmp r5, #4 + bne 8b +4: subs r4, #1 + vextin8 + \filter + vld1.16 d0, [r8], r9 + vqadd.s16 d0, d14 + vqrshrun.s16 d0, q0, #7 + vst1.32 d0[0], [r0], r1 + bne 4b +99: vpop {d8-d15} + pop {r4-r10} + bx lr +.endm + +function ff_hevc_put_qpel_h1_neon_8, export=1 + hevc_put_qpel_hX_neon_8 qpel_filter_1 +endfunc + +function ff_hevc_put_qpel_h2_neon_8, export=1 + hevc_put_qpel_hX_neon_8 qpel_filter_2 +endfunc + +function ff_hevc_put_qpel_h3_neon_8, export=1 + hevc_put_qpel_hX_neon_8 qpel_filter_3 +endfunc + + +function ff_hevc_put_qpel_uw_h1_neon_8, export=1 + hevc_put_qpel_uw_hX_neon_8 qpel_filter_1 +endfunc + +function ff_hevc_put_qpel_uw_h2_neon_8, export=1 + hevc_put_qpel_uw_hX_neon_8 qpel_filter_2 +endfunc + +function ff_hevc_put_qpel_uw_h3_neon_8, export=1 + hevc_put_qpel_uw_hX_neon_8 qpel_filter_3 +endfunc + +.macro hevc_put_qpel_hXvY_neon_8 filterh filterv + push {r4, r5, r6, r7} + ldr r4, [sp, #16] // height + ldr r5, [sp, #20] // width + + vpush {d8-d15} + sub r2, #4 + sub r2, r2, r3, lsl #1 + sub r2, r3 // extra_before 3 + lsl r1, #1 + mov r12, r4 + mov r6, r0 + mov r7, r2 +0: vextin8 + \filterh q0 + vextin8 + \filterh q1 + vextin8 + \filterh q2 + vextin8 + \filterh q3 + vextin8 + \filterh q4 + vextin8 + \filterh q5 + vextin8 + \filterh q6 + vextin8 + \filterh q7 + cmp r5, #4 + beq 4f +8: subs r4, #1 + \filterv + vst1.16 {q8}, [r0], r1 + regshuffle_q8 + vextin8 + \filterh q7 + bne 8b + subs r5, #8 + beq 99f + 
mov r4, r12 + add r6, #16 + mov r0, r6 + add r7, #8 + mov r2, r7 + b 0b +4: subs r4, #1 + \filterv + vst1.16 d16, [r0], r1 + regshuffle_q8 + vextin8 + \filterh q7 + bne 4b +99: vpop {d8-d15} + pop {r4, r5, r6, r7} + bx lr +.endm + +.macro hevc_put_qpel_uw_hXvY_neon_8 filterh filterv + push {r4-r10} + ldr r5, [sp, #28] // width + ldr r4, [sp, #32] // height + ldr r8, [sp, #36] // src2 + ldr r9, [sp, #40] // src2stride + vpush {d8-d15} + sub r2, #4 + sub r2, r2, r3, lsl #1 + sub r2, r3 // extra_before 3 + mov r12, r4 + mov r6, r0 + mov r7, r2 + cmp r8, #0 + bne .Lbi\@ +0: vextin8 + \filterh q0 + vextin8 + \filterh q1 + vextin8 + \filterh q2 + vextin8 + \filterh q3 + vextin8 + \filterh q4 + vextin8 + \filterh q5 + vextin8 + \filterh q6 + vextin8 + \filterh q7 + cmp r5, #4 + beq 4f +8: subs r4, #1 + \filterv + vqrshrun.s16 d0, q8, #6 + vst1.8 d0, [r0], r1 + regshuffle_q8 + vextin8 + \filterh q7 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #8 + mov r0, r6 + add r7, #8 + mov r2, r7 + b 0b +4: subs r4, #1 + \filterv + vqrshrun.s16 d0, q8, #6 + vst1.32 d0[0], [r0], r1 + regshuffle_q8 + vextin8 + \filterh q7 + bne 4b + b 99f +.Lbi\@: lsl r9, #1 + mov r10, r8 +0: vextin8 + \filterh q0 + vextin8 + \filterh q1 + vextin8 + \filterh q2 + vextin8 + \filterh q3 + vextin8 + \filterh q4 + vextin8 + \filterh q5 + vextin8 + \filterh q6 + vextin8 + \filterh q7 + cmp r5, #4 + beq 4f +8: subs r4, #1 + \filterv + vld1.16 {q0}, [r8], r9 + vqadd.s16 q0, q8 + vqrshrun.s16 d0, q0, #7 + vst1.8 d0, [r0], r1 + regshuffle_q8 + vextin8 + \filterh q7 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #8 + mov r0, r6 + add r10, #16 + mov r8, r10 + add r7, #8 + mov r2, r7 + b 0b +4: subs r4, #1 + \filterv + vld1.16 d0, [r8], r9 + vqadd.s16 d0, d16 + vqrshrun.s16 d0, q0, #7 + vst1.32 d0[0], [r0], r1 + regshuffle_q8 + vextin8 + \filterh q7 + bne 4b +99: vpop {d8-d15} + pop {r4-r10} + bx lr +.endm + + +function ff_hevc_put_qpel_h1v1_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 
qpel_filter_1 qpel_filter_1_32b +endfunc + +function ff_hevc_put_qpel_h2v1_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b +endfunc + +function ff_hevc_put_qpel_h3v1_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b +endfunc + +function ff_hevc_put_qpel_h1v2_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b +endfunc + +function ff_hevc_put_qpel_h2v2_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b +endfunc + +function ff_hevc_put_qpel_h3v2_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b +endfunc + +function ff_hevc_put_qpel_h1v3_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b +endfunc + +function ff_hevc_put_qpel_h2v3_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b +endfunc + +function ff_hevc_put_qpel_h3v3_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b +endfunc + + +function ff_hevc_put_qpel_uw_h1v1_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b +endfunc + +function ff_hevc_put_qpel_uw_h2v1_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b +endfunc + +function ff_hevc_put_qpel_uw_h3v1_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b +endfunc + +function ff_hevc_put_qpel_uw_h1v2_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b +endfunc + +function ff_hevc_put_qpel_uw_h2v2_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b +endfunc + +function ff_hevc_put_qpel_uw_h3v2_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b +endfunc + +function ff_hevc_put_qpel_uw_h1v3_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b +endfunc + +function ff_hevc_put_qpel_uw_h2v3_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 
qpel_filter_3_32b +endfunc + +function ff_hevc_put_qpel_uw_h3v3_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b +endfunc + +.macro init_put_pixels + pld [r1] + pld [r1, r2] + mov r12, MAX_PB_SIZE + lsl r12, #1 +.endm + +function ff_hevc_put_pixels_w2_neon_8, export=1 + init_put_pixels + vmov.u8 d5, #255 + vshr.u64 d5, #32 +0: subs r3, #1 + vld1.32 {d0[0]}, [r1], r2 + pld [r1] + vld1.32 d6, [r0] + vshll.u8 q0, d0, #6 + vbit d6, d0, d5 + vst1.32 d6, [r0], r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w4_neon_8, export=1 + init_put_pixels +0: subs r3, #2 + vld1.32 {d0[0]}, [r1], r2 + vld1.32 {d0[1]}, [r1], r2 + pld [r1] + pld [r1, r2] + vshll.u8 q0, d0, #6 + vst1.64 {d0}, [r0], r12 + vst1.64 {d1}, [r0], r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w6_neon_8, export=1 + init_put_pixels + vmov.u8 q10, #255 + vshr.u64 d21, #32 +0: subs r3, #1 + vld1.16 {d0}, [r1], r2 + pld [r1] + vshll.u8 q0, d0, #6 + vld1.8 {q12}, [r0] + vbit q12, q0, q10 + vst1.8 {q12}, [r0], r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w8_neon_8, export=1 + init_put_pixels +0: subs r3, #2 + vld1.8 {d0}, [r1], r2 + vld1.8 {d2}, [r1], r2 + pld [r1] + pld [r1, r2] + vshll.u8 q0, d0, #6 + vshll.u8 q1, d2, #6 + vst1.16 {q0}, [r0], r12 + vst1.16 {q1}, [r0], r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w12_neon_8, export=1 + init_put_pixels +0: subs r3, #2 + vld1.64 {d0}, [r1] + add r1, #8 + vld1.32 {d1[0]}, [r1], r2 + sub r1, #8 + vld1.64 {d2}, [r1] + add r1, #8 + vld1.32 {d1[1]}, [r1], r2 + sub r1, #8 + pld [r1] + pld [r1, r2] + vshll.u8 q8, d0, #6 + vshll.u8 q9, d1, #6 + vshll.u8 q10, d2, #6 + vmov d22, d19 + vst1.64 {d16, d17, d18}, [r0], r12 + vst1.64 {d20, d21, d22}, [r0], r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w16_neon_8, export=1 + init_put_pixels +0: subs r3, #2 + vld1.8 {q0}, [r1], r2 + vld1.8 {q1}, [r1], r2 + pld [r1] + pld [r1, r2] + vshll.u8 q8, d0, #6 + vshll.u8 q9, d1, #6 + 
vshll.u8 q10, d2, #6 + vshll.u8 q11, d3, #6 + vst1.8 {q8, q9}, [r0], r12 + vst1.8 {q10, q11}, [r0], r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w24_neon_8, export=1 + init_put_pixels +0: subs r3, #1 + vld1.8 {d0, d1, d2}, [r1], r2 + pld [r1] + vshll.u8 q10, d0, #6 + vshll.u8 q11, d1, #6 + vshll.u8 q12, d2, #6 + vstm r0, {q10, q11, q12} + add r0, r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w32_neon_8, export=1 + init_put_pixels +0: subs r3, #1 + vld1.8 {q0, q1}, [r1], r2 + pld [r1] + vshll.u8 q8, d0, #6 + vshll.u8 q9, d1, #6 + vshll.u8 q10, d2, #6 + vshll.u8 q11, d3, #6 + vstm r0, {q8, q9, q10, q11} + add r0, r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w48_neon_8, export=1 + init_put_pixels +0: subs r3, #1 + vld1.8 {q0, q1}, [r1] + add r1, #32 + vld1.8 {q2}, [r1], r2 + sub r1, #32 + pld [r1] + vshll.u8 q8, d0, #6 + vshll.u8 q9, d1, #6 + vshll.u8 q10, d2, #6 + vshll.u8 q11, d3, #6 + vshll.u8 q12, d4, #6 + vshll.u8 q13, d5, #6 + vstm r0, {q8, q9, q10, q11, q12, q13} + add r0, r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w64_neon_8, export=1 + init_put_pixels +0: subs r3, #1 + vld1.8 {q0, q1}, [r1] + add r1, #32 + vld1.8 {q2, q3}, [r1], r2 + sub r1, #32 + pld [r1] + vshll.u8 q8, d0, #6 + vshll.u8 q9, d1, #6 + vshll.u8 q10, d2, #6 + vshll.u8 q11, d3, #6 + vshll.u8 q12, d4, #6 + vshll.u8 q13, d5, #6 + vshll.u8 q14, d6, #6 + vshll.u8 q15, d7, #6 + vstm r0, {q8, q9, q10, q11, q12, q13, q14, q15} + add r0, r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_qpel_uw_pixels_neon_8, export=1 + push {r4-r9} + ldr r5, [sp, #24] // width + ldr r4, [sp, #28] // height + ldr r8, [sp, #32] // src2 + ldr r9, [sp, #36] // src2stride + vpush {d8-d15} + cmp r8, #0 + bne 2f +1: subs r4, #1 + vld1.8 {d0}, [r2], r3 + vst1.8 d0, [r0], r1 + bne 1b + vpop {d8-d15} + pop {r4-r9} + bx lr +2: subs r4, #1 + vld1.8 {d0}, [r2], r3 + vld1.16 {q1}, [r8], r9 + vshll.u8 q0, d0, #6 + vqadd.s16 q0, q1 + vqrshrun.s16 d0, q0, #7 + vst1.8 
d0, [r0], r1 + bne 2b + vpop {d8-d15} + pop {r4-r9} + bx lr +endfunc + +.macro put_qpel_uw_pixels width, regs, regs2, regs3, regs4 +function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1 + ldr r12, [sp] // height +1: subs r12, #4 + vld1.32 {\regs} , [r2], r3 + vld1.32 {\regs2} , [r2], r3 + vld1.32 {\regs3} , [r2], r3 + vld1.32 {\regs4} , [r2], r3 + vst1.32 {\regs} , [r0], r1 + vst1.32 {\regs2} , [r0], r1 + vst1.32 {\regs3} , [r0], r1 + vst1.32 {\regs4} , [r0], r1 + bne 1b + bx lr +endfunc +.endm + +.macro put_qpel_uw_pixels_m width, regs, regs2, regs3, regs4 +function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1 + push {r4-r5} + ldr r12, [sp, #8] // height +1: subs r12, #2 + mov r4, r2 + vld1.32 {\regs} , [r2]! + vld1.32 {\regs2} , [r2] + add r2, r4, r3 + mov r4, r2 + vld1.32 {\regs3} , [r2]! + vld1.32 {\regs4} , [r2] + add r2, r4, r3 + mov r5, r0 + vst1.32 {\regs} , [r0]! + vst1.32 {\regs2} , [r0] + add r0, r5, r1 + mov r5, r0 + vst1.32 {\regs3} , [r0]! + vst1.32 {\regs4} , [r0] + add r0, r5, r1 + bne 1b + pop {r4-r5} + bx lr +endfunc +.endm + +put_qpel_uw_pixels 4, d0[0], d0[1], d1[0], d1[1] +put_qpel_uw_pixels 8, d0, d1, d2, d3 +put_qpel_uw_pixels_m 12, d0, d1[0], d2, d3[0] +put_qpel_uw_pixels 16, q0, q1, q2, q3 +put_qpel_uw_pixels 24, d0-d2, d3-d5, d16-d18, d19-d21 +put_qpel_uw_pixels 32, q0-q1, q2-q3, q8-q9, q10-q11 +put_qpel_uw_pixels_m 48, q0-q1, q2, q8-q9, q10 +put_qpel_uw_pixels_m 64, q0-q1, q2-q3, q8-q9, q10-q11 diff --git a/arm/neon.S b/arm/neon.S new file mode 100644 index 0000000..787bc4b --- /dev/null +++ b/arm/neon.S @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7 + vtrn.32 \r0, \r4 + vtrn.32 \r1, \r5 + vtrn.32 \r2, \r6 + vtrn.32 \r3, \r7 + vtrn.16 \r0, \r2 + vtrn.16 \r1, \r3 + vtrn.16 \r4, \r6 + vtrn.16 \r5, \r7 + vtrn.8 \r0, \r1 + vtrn.8 \r2, \r3 + vtrn.8 \r4, \r5 + vtrn.8 \r6, \r7 +.endm + +.macro transpose_4x4 r0, r1, r2, r3 + vtrn.16 \r0, \r2 + vtrn.16 \r1, \r3 + vtrn.8 \r0, \r1 + vtrn.8 \r2, \r3 +.endm + +.macro swap4 r0, r1, r2, r3, r4, r5, r6, r7 + vswp \r0, \r4 + vswp \r1, \r5 + vswp \r2, \r6 + vswp \r3, \r7 +.endm + +.macro transpose16_4x4 r0, r1, r2, r3, r4, r5, r6, r7 + vtrn.32 \r0, \r2 + vtrn.32 \r1, \r3 + vtrn.32 \r4, \r6 + vtrn.32 \r5, \r7 + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 + vtrn.16 \r4, \r5 + vtrn.16 \r6, \r7 +.endm diff --git a/bitstream.cc b/bitstream.cc new file mode 100644 index 0000000..0298be9 --- /dev/null +++ b/bitstream.cc @@ -0,0 +1,176 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "bitstream.h" +#include "de265.h" + +#include +#include +#include + + + +void bitreader_init(bitreader* br, unsigned char* buffer, int len) +{ + br->data = buffer; + br->bytes_remaining = len; + + br->nextbits=0; + br->nextbits_cnt=0; + + bitreader_refill(br); +} + +void bitreader_refill(bitreader* br) +{ + int shift = 64-br->nextbits_cnt; + + while (shift >= 8 && br->bytes_remaining) { + uint64_t newval = *br->data++; + br->bytes_remaining--; + + shift -= 8; + newval <<= shift; + br->nextbits |= newval; + } + + br->nextbits_cnt = 64-shift; +} + +int get_bits(bitreader* br, int n) +{ + if (br->nextbits_cnt < n) { + bitreader_refill(br); + } + + uint64_t val = br->nextbits; + val >>= 64-n; + + br->nextbits <<= n; + br->nextbits_cnt -= n; + + return val; +} + +int get_bits_fast(bitreader* br, int n) +{ + assert(br->nextbits_cnt >= n); + + uint64_t val = br->nextbits; + val >>= 64-n; + + br->nextbits <<= n; + br->nextbits_cnt -= n; + + return val; +} + +int peek_bits(bitreader* br, int n) +{ + if (br->nextbits_cnt < n) { + bitreader_refill(br); + } + + uint64_t val = br->nextbits; + val >>= 64-n; + + return val; +} + +void skip_bits(bitreader* br, int n) +{ + if (br->nextbits_cnt < n) { + bitreader_refill(br); + } + + br->nextbits <<= n; + br->nextbits_cnt -= n; +} + +void skip_bits_fast(bitreader* br, int n) +{ + br->nextbits <<= n; + br->nextbits_cnt -= n; +} + +void skip_to_byte_boundary(bitreader* br) +{ + int nskip = (br->nextbits_cnt & 7); + + br->nextbits <<= nskip; + br->nextbits_cnt -= nskip; +} + +void prepare_for_CABAC(bitreader* br) +{ + skip_to_byte_boundary(br); + + int rewind = br->nextbits_cnt/8; + br->data -= rewind; + br->bytes_remaining += rewind; + br->nextbits = 0; + br->nextbits_cnt = 0; +} + +int get_uvlc(bitreader* br) +{ + int num_zeros=0; + + while (get_bits(br,1)==0) { + num_zeros++; + + if (num_zeros > 
MAX_UVLC_LEADING_ZEROS) { return UVLC_ERROR; } + } + + int offset = 0; + if (num_zeros != 0) { + offset = get_bits(br, num_zeros); + int value = offset + (1<0); + return value; + } else { + return 0; + } +} + +int get_svlc(bitreader* br) +{ + int v = get_uvlc(br); + if (v==0) return v; + if (v==UVLC_ERROR) return UVLC_ERROR; + + bool negative = ((v&1)==0); + return negative ? -v/2 : (v+1)/2; +} + +bool check_rbsp_trailing_bits(bitreader* br) +{ + int stop_bit = get_bits(br,1); + assert(stop_bit==1); + + while (br->nextbits_cnt>0 || br->bytes_remaining>0) { + int filler = get_bits(br,1); + if (filler!=0) { + return false; + } + } + + return true; +} diff --git a/cabac.cc b/cabac.cc new file mode 100644 index 0000000..102bc57 --- /dev/null +++ b/cabac.cc @@ -0,0 +1,1033 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "cabac.h" +#include "util.h" + +#include +#include +#include +#include + +#define INITIAL_CABAC_BUFFER_CAPACITY 4096 + + +static const uint8_t LPS_table[64][4] = + { + { 128, 176, 208, 240}, + { 128, 167, 197, 227}, + { 128, 158, 187, 216}, + { 123, 150, 178, 205}, + { 116, 142, 169, 195}, + { 111, 135, 160, 185}, + { 105, 128, 152, 175}, + { 100, 122, 144, 166}, + { 95, 116, 137, 158}, + { 90, 110, 130, 150}, + { 85, 104, 123, 142}, + { 81, 99, 117, 135}, + { 77, 94, 111, 128}, + { 73, 89, 105, 122}, + { 69, 85, 100, 116}, + { 66, 80, 95, 110}, + { 62, 76, 90, 104}, + { 59, 72, 86, 99}, + { 56, 69, 81, 94}, + { 53, 65, 77, 89}, + { 51, 62, 73, 85}, + { 48, 59, 69, 80}, + { 46, 56, 66, 76}, + { 43, 53, 63, 72}, + { 41, 50, 59, 69}, + { 39, 48, 56, 65}, + { 37, 45, 54, 62}, + { 35, 43, 51, 59}, + { 33, 41, 48, 56}, + { 32, 39, 46, 53}, + { 30, 37, 43, 50}, + { 29, 35, 41, 48}, + { 27, 33, 39, 45}, + { 26, 31, 37, 43}, + { 24, 30, 35, 41}, + { 23, 28, 33, 39}, + { 22, 27, 32, 37}, + { 21, 26, 30, 35}, + { 20, 24, 29, 33}, + { 19, 23, 27, 31}, + { 18, 22, 26, 30}, + { 17, 21, 25, 28}, + { 16, 20, 23, 27}, + { 15, 19, 22, 25}, + { 14, 18, 21, 24}, + { 14, 17, 20, 23}, + { 13, 16, 19, 22}, + { 12, 15, 18, 21}, + { 12, 14, 17, 20}, + { 11, 14, 16, 19}, + { 11, 13, 15, 18}, + { 10, 12, 15, 17}, + { 10, 12, 14, 16}, + { 9, 11, 13, 15}, + { 9, 11, 12, 14}, + { 8, 10, 12, 14}, + { 8, 9, 11, 13}, + { 7, 9, 11, 12}, + { 7, 9, 10, 12}, + { 7, 8, 10, 11}, + { 6, 8, 9, 11}, + { 6, 7, 9, 10}, + { 6, 7, 8, 9}, + { 2, 2, 2, 2} + }; + +static const uint8_t renorm_table[32] = + { + 6, 5, 4, 4, + 3, 3, 3, 3, + 2, 2, 2, 2, + 2, 2, 2, 2, + 1, 1, 1, 1, + 1, 1, 1, 1, + 1, 1, 1, 1, + 1, 1, 1, 1 + }; + +static const uint8_t next_state_MPS[64] = + { + 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16, + 17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32, + 33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48, + 49,50,51,52,53,54,55,56,57,58,59,60,61,62,62,63 + }; + +static const uint8_t 
next_state_LPS[64] = + { + 0,0,1,2,2,4,4,5,6,7,8,9,9,11,11,12, + 13,13,15,15,16,16,18,18,19,19,21,21,22,22,23,24, + 24,25,26,26,27,27,28,29,29,30,30,30,31,32,32,33, + 33,33,34,34,35,35,35,36,36,36,37,37,37,38,38,63 + }; + + + + + +#ifdef DE265_LOG_TRACE +int logcnt=1; +#endif + +void init_CABAC_decoder(CABAC_decoder* decoder, uint8_t* bitstream, int length) +{ + assert(length >= 0); + + decoder->bitstream_start = bitstream; + decoder->bitstream_curr = bitstream; + decoder->bitstream_end = bitstream+length; +} + +void init_CABAC_decoder_2(CABAC_decoder* decoder) +{ + int length = decoder->bitstream_end - decoder->bitstream_curr; + + decoder->range = 510; + decoder->bits_needed = 8; + + decoder->value = 0; + + if (length>0) { decoder->value = (*decoder->bitstream_curr++) << 8; decoder->bits_needed-=8; } + if (length>1) { decoder->value |= (*decoder->bitstream_curr++); decoder->bits_needed-=8; } + + logtrace(LogCABAC,"[%3d] init_CABAC_decode_2 r:%x v:%x\n", logcnt, decoder->range, decoder->value); +} + + +int decode_CABAC_bit(CABAC_decoder* decoder, context_model* model) +{ + logtrace(LogCABAC,"[%3d] decodeBin r:%x v:%x state:%d\n",logcnt,decoder->range, decoder->value, model->state); + + int decoded_bit; + int LPS = LPS_table[model->state][ ( decoder->range >> 6 ) - 4 ]; + decoder->range -= LPS; + + uint32_t scaled_range = decoder->range << 7; + + logtrace(LogCABAC,"[%3d] sr:%x v:%x\n",logcnt,scaled_range, decoder->value); + + if (decoder->value < scaled_range) + { + logtrace(LogCABAC,"[%3d] MPS\n",logcnt); + + // MPS path + + decoded_bit = model->MPSbit; + model->state = next_state_MPS[model->state]; + + if (scaled_range < ( 256 << 7 ) ) + { + // scaled range, highest bit (15) not set + + decoder->range = scaled_range >> 6; // shift range by one bit + decoder->value <<= 1; // shift value by one bit + decoder->bits_needed++; + + if (decoder->bits_needed == 0) + { + decoder->bits_needed = -8; + if (decoder->bitstream_curr < decoder->bitstream_end) + { decoder->value 
|= *decoder->bitstream_curr++; } + } + } + } + else + { + logtrace(LogCABAC,"[%3d] LPS\n",logcnt); + //printf("%d %d\n", model->state, 0); + + // LPS path + + decoder->value = (decoder->value - scaled_range); + + int num_bits = renorm_table[ LPS >> 3 ]; + decoder->value <<= num_bits; + decoder->range = LPS << num_bits; /* this is always >= 0x100 except for state 63, + but state 63 is never used */ + + int num_bitsTab = renorm_table[ LPS >> 3 ]; + + assert(num_bits == num_bitsTab); + + decoded_bit = 1 - model->MPSbit; + + if (model->state==0) { model->MPSbit = 1-model->MPSbit; } + model->state = next_state_LPS[model->state]; + + decoder->bits_needed += num_bits; + + if (decoder->bits_needed >= 0) + { + logtrace(LogCABAC,"bits_needed: %d\n", decoder->bits_needed); + if (decoder->bitstream_curr < decoder->bitstream_end) + { decoder->value |= (*decoder->bitstream_curr++) << decoder->bits_needed; } + + decoder->bits_needed -= 8; + } + } + + logtrace(LogCABAC,"[%3d] -> bit %d r:%x v:%x\n", logcnt, decoded_bit, decoder->range, decoder->value); +#ifdef DE265_LOG_TRACE + logcnt++; +#endif + + return decoded_bit; +} + +int decode_CABAC_term_bit(CABAC_decoder* decoder) +{ + logtrace(LogCABAC,"CABAC term: range=%x\n", decoder->range); + + decoder->range -= 2; + uint32_t scaledRange = decoder->range << 7; + + if (decoder->value >= scaledRange) + { + return 1; + } + else + { + // there is a while loop in the standard, but it will always be executed only once + + if (scaledRange < (256<<7)) + { + decoder->range = scaledRange >> 6; + decoder->value *= 2; + + decoder->bits_needed++; + if (decoder->bits_needed==0) + { + decoder->bits_needed = -8; + + if (decoder->bitstream_curr < decoder->bitstream_end) { + decoder->value += (*decoder->bitstream_curr++); + } + } + } + + return 0; + } +} + + + +int decode_CABAC_bypass(CABAC_decoder* decoder) +{ + logtrace(LogCABAC,"[%3d] bypass r:%x v:%x\n",logcnt,decoder->range, decoder->value); + + decoder->value <<= 1; + decoder->bits_needed++; + 
+ if (decoder->bits_needed >= 0) + { + if (decoder->bitstream_end > decoder->bitstream_curr) { + decoder->bits_needed = -8; + decoder->value |= *decoder->bitstream_curr++; + } + } + + int bit; + uint32_t scaled_range = decoder->range << 7; + if (decoder->value >= scaled_range) + { + decoder->value -= scaled_range; + bit=1; + } + else + { + bit=0; + } + + logtrace(LogCABAC,"[%3d] -> bit %d r:%x v:%x\n", logcnt, bit, decoder->range, decoder->value); +#ifdef DE265_LOG_TRACE + logcnt++; +#endif + + return bit; +} + + +int decode_CABAC_TU_bypass(CABAC_decoder* decoder, int cMax) +{ + for (int i=0;irange, decoder->value, nBits); + + decoder->value <<= nBits; + decoder->bits_needed+=nBits; + + if (decoder->bits_needed >= 0) + { + if (decoder->bitstream_end > decoder->bitstream_curr) { + int input = *decoder->bitstream_curr++; + input <<= decoder->bits_needed; + + decoder->bits_needed -= 8; + decoder->value |= input; + } + } + + uint32_t scaled_range = decoder->range << 7; + int value = decoder->value / scaled_range; + if (unlikely(value>=(1<value -= value * scaled_range; + + logtrace(LogCABAC,"[%3d] -> value %d r:%x v:%x\n", logcnt+nBits-1, + value, decoder->range, decoder->value); + +#ifdef DE265_LOG_TRACE + logcnt+=nBits; +#endif + + return value; +} + + +int decode_CABAC_FL_bypass(CABAC_decoder* decoder, int nBits) +{ + int value=0; + + if (likely(nBits<=8)) { + if (nBits==0) { + return 0; + } + // we could use decode_CABAC_bypass() for a single bit, but this seems to be slower +#if 0 + else if (nBits==1) { + value = decode_CABAC_bypass(decoder); + } +#endif + else { + value = decode_CABAC_FL_bypass_parallel(decoder,nBits); + } + } + else { + value = decode_CABAC_FL_bypass_parallel(decoder,8); + nBits-=8; + + while (nBits--) { + value <<= 1; + value |= decode_CABAC_bypass(decoder); + } + } + logtrace(LogCABAC," -> FL: %d\n", value); + + return value; +} + +int decode_CABAC_TR_bypass(CABAC_decoder* decoder, int cRiceParam, int cTRMax) +{ + int prefix = 
decode_CABAC_TU_bypass(decoder, cTRMax>>cRiceParam); + if (prefix==4) { // TODO check: constant 4 only works for coefficient decoding + return cTRMax; + } + + int suffix = decode_CABAC_FL_bypass(decoder, cRiceParam); + + return (prefix << cRiceParam) | suffix; +} + + +#define MAX_PREFIX 32 + +int decode_CABAC_EGk_bypass(CABAC_decoder* decoder, int k) +{ + int base=0; + int n=k; + + for (;;) + { + int bit = decode_CABAC_bypass(decoder); + if (bit==0) + break; + else { + base += 1<=8) { + append_byte((vlc_buffer >> (vlc_buffer_len-8)) & 0xFF); + vlc_buffer_len -= 8; + } +} + +void CABAC_encoder::write_uvlc(int value) +{ + assert(value>=0); + + int nLeadingZeros=0; + int base=0; + int range=1; + + while (value>=base+range) { + base += range; + range <<= 1; + nLeadingZeros++; + } + + write_bits((1<0) write_uvlc(2*value-1); + else write_uvlc(-2*value); +} + +void CABAC_encoder_bitstream::flush_VLC() +{ + while (vlc_buffer_len>=8) { + append_byte((vlc_buffer >> (vlc_buffer_len-8)) & 0xFF); + vlc_buffer_len -= 8; + } + + if (vlc_buffer_len>0) { + append_byte(vlc_buffer << (8-vlc_buffer_len)); + vlc_buffer_len = 0; + } + + vlc_buffer = 0; +} + +void CABAC_encoder_bitstream::skip_bits(int nBits) +{ + while (nBits>=8) { + write_bits(0,8); + nBits-=8; + } + + if (nBits>0) { + write_bits(0,nBits); + } +} + + +int CABAC_encoder_bitstream::number_free_bits_in_byte() const +{ + if ((vlc_buffer_len % 8)==0) return 0; + return 8- (vlc_buffer_len % 8); +} + + +void CABAC_encoder_bitstream::check_size_and_resize(int nBytes) +{ + if (data_size+nBytes > data_capacity) { // 1 extra byte for stuffing + if (data_capacity==0) { + data_capacity = INITIAL_CABAC_BUFFER_CAPACITY; + } else { + data_capacity *= 2; + } + + data_mem = (uint8_t*)realloc(data_mem,data_capacity); + } +} + + +void CABAC_encoder_bitstream::append_byte(int byte) +{ + check_size_and_resize(2); + + // --- emulation prevention --- + + /* These byte sequences may never occur in the bitstream: + 0x000000 / 0x000001 / 
0x000002 + + Hence, we have to add a 0x03 before the third byte. + We also have to add a 0x03 for this sequence: 0x000003, because + the escape byte itself also has to be escaped. + */ + + // S0 --(0)--> S1 --(0)--> S2 --(0,1,2,3)--> add stuffing + + if (byte<=3) { + /**/ if (state< 2 && byte==0) { state++; } + else if (state==2 && byte<=3) { + data_mem[ data_size++ ] = 3; + + if (byte==0) state=1; + else state=0; + } + else { state=0; } + } + else { state=0; } + + + // write actual data byte + + data_mem[ data_size++ ] = byte; +} + + +void CABAC_encoder_bitstream::write_startcode() +{ + check_size_and_resize(3); + + data_mem[ data_size+0 ] = 0; + data_mem[ data_size+1 ] = 0; + data_mem[ data_size+2 ] = 1; + data_size+=3; +} + +void CABAC_encoder_bitstream::init_CABAC() +{ + range = 510; + low = 0; + + bits_left = 23; + buffered_byte = 0xFF; + num_buffered_bytes = 0; +} + +void CABAC_encoder_bitstream::flush_CABAC() +{ + if (low >> (32 - bits_left)) + { + append_byte(buffered_byte + 1); + while (num_buffered_bytes > 1) + { + append_byte(0x00); + num_buffered_bytes--; + } + + low -= 1 << (32 - bits_left); + } + else + { + if (num_buffered_bytes > 0) + { + append_byte(buffered_byte); + } + + while (num_buffered_bytes > 1) + { + append_byte(0xff); + num_buffered_bytes--; + } + } + + // printf("low: %08x nbits left:%d filled:%d\n",low,bits_left,32-bits_left); + + write_bits(low >> 8, 24-bits_left); +} + + +void CABAC_encoder_bitstream::write_out() +{ + //logtrace(LogCABAC,"low = %08x (bits_left=%d)\n",low,bits_left); + int leadByte = low >> (24 - bits_left); + bits_left += 8; + low &= 0xffffffffu >> bits_left; + + //logtrace(LogCABAC,"write byte %02x\n",leadByte); + //logtrace(LogCABAC,"-> low = %08x\n",low); + + if (leadByte == 0xff) + { + num_buffered_bytes++; + } + else + { + if (num_buffered_bytes > 0) + { + int carry = leadByte >> 8; + int byte = buffered_byte + carry; + buffered_byte = leadByte & 0xff; + append_byte(byte); + + byte = ( 0xff + carry ) & 0xff; + 
while ( num_buffered_bytes > 1 ) + { + append_byte(byte); + num_buffered_bytes--; + } + } + else + { + num_buffered_bytes = 1; + buffered_byte = leadByte; + } + } +} + +void CABAC_encoder_bitstream::testAndWriteOut() +{ + // logtrace(LogCABAC,"bits_left = %d\n",bits_left); + + if (bits_left < 12) + { + write_out(); + } +} + + +#ifdef DE265_LOG_TRACE +int encBinCnt=1; +#endif + +void CABAC_encoder_bitstream::write_CABAC_bit(int modelIdx, int bin) +{ + context_model* model = &(*mCtxModels)[modelIdx]; + //m_uiBinsCoded += m_binCountIncrement; + //rcCtxModel.setBinsCoded( 1 ); + + logtrace(LogCABAC,"[%d] range=%x low=%x state=%d, bin=%d\n", + encBinCnt, range,low, model->state,bin); + + /* + printf("[%d] range=%x low=%x state=%d, bin=%d\n", + encBinCnt, range,low, model->state,bin); + + printf("%d %d X\n",model->state,bin != model->MPSbit); + */ + +#ifdef DE265_LOG_TRACE + encBinCnt++; +#endif + + uint32_t LPS = LPS_table[model->state][ ( range >> 6 ) - 4 ]; + range -= LPS; + + if (bin != model->MPSbit) + { + //logtrace(LogCABAC,"LPS\n"); + + int num_bits = renorm_table[ LPS >> 3 ]; + low = (low + range) << num_bits; + range = LPS << num_bits; + + if (model->state==0) { model->MPSbit = 1-model->MPSbit; } + + model->state = next_state_LPS[model->state]; + + bits_left -= num_bits; + } + else + { + //logtrace(LogCABAC,"MPS\n"); + + model->state = next_state_MPS[model->state]; + + + // renorm + + if (range >= 256) { return; } + + low <<= 1; + range <<= 1; + bits_left--; + } + + testAndWriteOut(); +} + +void CABAC_encoder_bitstream::write_CABAC_bypass(int bin) +{ + logtrace(LogCABAC,"[%d] bypass = %d, range=%x\n",encBinCnt,bin,range); + /* + printf("[%d] bypass = %d, range=%x\n",encBinCnt,bin,range); + printf("%d %d X\n",64, -1); + */ + +#ifdef DE265_LOG_TRACE + encBinCnt++; +#endif + + // BinsCoded += m_binCountIncrement; + low <<= 1; + + if (bin) + { + low += range; + } + bits_left--; + + testAndWriteOut(); +} + +void CABAC_encoder::write_CABAC_TU_bypass(int value, int 
cMax) +{ + for (int i=0;i0) { + n--; + write_CABAC_bypass(value & (1<= 256) + { + return; + } + else + { + low <<= 1; + range <<= 1; + bits_left--; + } + + testAndWriteOut(); +} + + + + +static const uint32_t entropy_table[128] = { + // -------------------- 200 -------------------- + /* state= 0 */ 0x07d13 /* 0.977164 */, 0x08255 /* 1.018237 */, + /* state= 1 */ 0x07738 /* 0.931417 */, 0x086ef /* 1.054179 */, + /* state= 2 */ 0x0702b /* 0.876323 */, 0x0935a /* 1.151195 */, + /* state= 3 */ 0x069e6 /* 0.827333 */, 0x09c7f /* 1.222650 */, + /* state= 4 */ 0x062e8 /* 0.772716 */, 0x0a2c7 /* 1.271708 */, + /* state= 5 */ 0x05c18 /* 0.719488 */, 0x0ae25 /* 1.360532 */, + /* state= 6 */ 0x05632 /* 0.673414 */, 0x0b724 /* 1.430793 */, + /* state= 7 */ 0x05144 /* 0.634904 */, 0x0c05d /* 1.502850 */, + /* state= 8 */ 0x04bdf /* 0.592754 */, 0x0ccf2 /* 1.601145 */, + /* state= 9 */ 0x0478d /* 0.559012 */, 0x0d57b /* 1.667843 */, + /* state=10 */ 0x042ad /* 0.520924 */, 0x0de81 /* 1.738336 */, + /* state=11 */ 0x03f4d /* 0.494564 */, 0x0e4b8 /* 1.786871 */, + /* state=12 */ 0x03a9d /* 0.457945 */, 0x0f471 /* 1.909721 */, + /* state=13 */ 0x037d5 /* 0.436201 */, 0x0fc56 /* 1.971385 */, + /* state=14 */ 0x034c2 /* 0.412177 */, 0x10236 /* 2.017284 */, + /* state=15 */ 0x031a6 /* 0.387895 */, 0x10d5c /* 2.104394 */, + /* state=16 */ 0x02e62 /* 0.362383 */, 0x11b34 /* 2.212552 */, + /* state=17 */ 0x02c20 /* 0.344752 */, 0x120b4 /* 2.255512 */, + /* state=18 */ 0x029b8 /* 0.325943 */, 0x1294d /* 2.322672 */, + /* state=19 */ 0x02791 /* 0.309143 */, 0x135e1 /* 2.420959 */, + /* state=20 */ 0x02562 /* 0.292057 */, 0x13e37 /* 2.486077 */, + /* state=21 */ 0x0230d /* 0.273846 */, 0x144fd /* 2.539000 */, + /* state=22 */ 0x02193 /* 0.262308 */, 0x150c9 /* 2.631150 */, + /* state=23 */ 0x01f5d /* 0.245026 */, 0x15ca0 /* 2.723641 */, + /* state=24 */ 0x01de7 /* 0.233617 */, 0x162f9 /* 2.773246 */, + /* state=25 */ 0x01c2f /* 0.220208 */, 0x16d99 /* 2.856259 */, + /* state=26 */ 0x01a8e 
/* 0.207459 */, 0x17a93 /* 2.957634 */, + /* state=27 */ 0x0195a /* 0.198065 */, 0x18051 /* 3.002477 */, + /* state=28 */ 0x01809 /* 0.187778 */, 0x18764 /* 3.057759 */, + /* state=29 */ 0x0164a /* 0.174144 */, 0x19460 /* 3.159206 */, + /* state=30 */ 0x01539 /* 0.165824 */, 0x19f20 /* 3.243181 */, + /* state=31 */ 0x01452 /* 0.158756 */, 0x1a465 /* 3.284334 */, + /* state=32 */ 0x0133b /* 0.150261 */, 0x1b422 /* 3.407303 */, + /* state=33 */ 0x0120c /* 0.140995 */, 0x1bce5 /* 3.475767 */, + /* state=34 */ 0x01110 /* 0.133315 */, 0x1c394 /* 3.527962 */, + /* state=35 */ 0x0104d /* 0.127371 */, 0x1d059 /* 3.627736 */, + /* state=36 */ 0x00f8b /* 0.121451 */, 0x1d74b /* 3.681983 */, + /* state=37 */ 0x00ef4 /* 0.116829 */, 0x1dfd0 /* 3.748540 */, + /* state=38 */ 0x00e10 /* 0.109864 */, 0x1e6d3 /* 3.803335 */, + /* state=39 */ 0x00d3f /* 0.103507 */, 0x1f925 /* 3.946462 */, + /* state=40 */ 0x00cc4 /* 0.099758 */, 0x1fda7 /* 3.981667 */, + /* state=41 */ 0x00c42 /* 0.095792 */, 0x203f8 /* 4.031012 */, + /* state=42 */ 0x00b78 /* 0.089610 */, 0x20f7d /* 4.121014 */, + /* state=43 */ 0x00afc /* 0.085830 */, 0x21dd6 /* 4.233102 */, + /* state=44 */ 0x00a5e /* 0.081009 */, 0x22419 /* 4.282016 */, + /* state=45 */ 0x00a1b /* 0.078950 */, 0x22a5e /* 4.331015 */, + /* state=46 */ 0x00989 /* 0.074514 */, 0x23756 /* 4.432323 */, + /* state=47 */ 0x0091b /* 0.071166 */, 0x24225 /* 4.516775 */, + /* state=48 */ 0x008cf /* 0.068837 */, 0x2471a /* 4.555487 */, + /* state=49 */ 0x00859 /* 0.065234 */, 0x25313 /* 4.649048 */, + /* state=50 */ 0x00814 /* 0.063140 */, 0x25d67 /* 4.729721 */, + /* state=51 */ 0x007b6 /* 0.060272 */, 0x2651f /* 4.790028 */, + /* state=52 */ 0x0076e /* 0.058057 */, 0x2687c /* 4.816294 */, + /* state=53 */ 0x00707 /* 0.054924 */, 0x27da7 /* 4.981661 */, + /* state=54 */ 0x006d5 /* 0.053378 */, 0x28172 /* 5.011294 */, + /* state=55 */ 0x00659 /* 0.049617 */, 0x28948 /* 5.072512 */, + /* state=56 */ 0x00617 /* 0.047598 */, 0x297c5 /* 5.185722 */, + /* 
state=57 */ 0x005dd /* 0.045814 */, 0x2a2df /* 5.272434 */, + /* state=58 */ 0x005c1 /* 0.044965 */, 0x2a581 /* 5.293019 */, + /* state=59 */ 0x00574 /* 0.042619 */, 0x2ad59 /* 5.354304 */, + /* state=60 */ 0x0053b /* 0.040882 */, 0x2bba5 /* 5.465973 */, + /* state=61 */ 0x0050c /* 0.039448 */, 0x2c596 /* 5.543651 */, + /* state=62 */ 0x004e9 /* 0.038377 */, 0x2cd88 /* 5.605741 */, + 0x00400 , 0x2d000 /* dummy, should never be used */ +}; + + +static const uint32_t entropy_table_orig[128] = { + 0x07b23, 0x085f9, 0x074a0, 0x08cbc, 0x06ee4, 0x09354, 0x067f4, 0x09c1b, + 0x060b0, 0x0a62a, 0x05a9c, 0x0af5b, 0x0548d, 0x0b955, 0x04f56, 0x0c2a9, + 0x04a87, 0x0cbf7, 0x045d6, 0x0d5c3, 0x04144, 0x0e01b, 0x03d88, 0x0e937, + 0x039e0, 0x0f2cd, 0x03663, 0x0fc9e, 0x03347, 0x10600, 0x03050, 0x10f95, + 0x02d4d, 0x11a02, 0x02ad3, 0x12333, 0x0286e, 0x12cad, 0x02604, 0x136df, + 0x02425, 0x13f48, 0x021f4, 0x149c4, 0x0203e, 0x1527b, 0x01e4d, 0x15d00, + 0x01c99, 0x166de, 0x01b18, 0x17017, 0x019a5, 0x17988, 0x01841, 0x18327, + 0x016df, 0x18d50, 0x015d9, 0x19547, 0x0147c, 0x1a083, 0x0138e, 0x1a8a3, + 0x01251, 0x1b418, 0x01166, 0x1bd27, 0x01068, 0x1c77b, 0x00f7f, 0x1d18e, + 0x00eda, 0x1d91a, 0x00e19, 0x1e254, 0x00d4f, 0x1ec9a, 0x00c90, 0x1f6e0, + 0x00c01, 0x1fef8, 0x00b5f, 0x208b1, 0x00ab6, 0x21362, 0x00a15, 0x21e46, + 0x00988, 0x2285d, 0x00934, 0x22ea8, 0x008a8, 0x239b2, 0x0081d, 0x24577, + 0x007c9, 0x24ce6, 0x00763, 0x25663, 0x00710, 0x25e8f, 0x006a0, 0x26a26, + 0x00672, 0x26f23, 0x005e8, 0x27ef8, 0x005ba, 0x284b5, 0x0055e, 0x29057, + 0x0050c, 0x29bab, 0x004c1, 0x2a674, 0x004a7, 0x2aa5e, 0x0046f, 0x2b32f, + 0x0041f, 0x2c0ad, 0x003e7, 0x2ca8d, 0x003ba, 0x2d323, 0x0010c, 0x3bfbb +}; + + +const uint32_t entropy_table_theory[128] = + { + 0x08000, 0x08000, 0x076da, 0x089a0, 0x06e92, 0x09340, 0x0670a, 0x09cdf, 0x06029, 0x0a67f, 0x059dd, 0x0b01f, 0x05413, 0x0b9bf, 0x04ebf, 0x0c35f, + 0x049d3, 0x0ccff, 0x04546, 0x0d69e, 0x0410d, 0x0e03e, 0x03d22, 0x0e9de, 0x0397d, 0x0f37e, 0x03619, 0x0fd1e, 
0x032ee, 0x106be, 0x02ffa, 0x1105d, + 0x02d37, 0x119fd, 0x02aa2, 0x1239d, 0x02836, 0x12d3d, 0x025f2, 0x136dd, 0x023d1, 0x1407c, 0x021d2, 0x14a1c, 0x01ff2, 0x153bc, 0x01e2f, 0x15d5c, + 0x01c87, 0x166fc, 0x01af7, 0x1709b, 0x0197f, 0x17a3b, 0x0181d, 0x183db, 0x016d0, 0x18d7b, 0x01595, 0x1971b, 0x0146c, 0x1a0bb, 0x01354, 0x1aa5a, + 0x0124c, 0x1b3fa, 0x01153, 0x1bd9a, 0x01067, 0x1c73a, 0x00f89, 0x1d0da, 0x00eb7, 0x1da79, 0x00df0, 0x1e419, 0x00d34, 0x1edb9, 0x00c82, 0x1f759, + 0x00bda, 0x200f9, 0x00b3c, 0x20a99, 0x00aa5, 0x21438, 0x00a17, 0x21dd8, 0x00990, 0x22778, 0x00911, 0x23118, 0x00898, 0x23ab8, 0x00826, 0x24458, + 0x007ba, 0x24df7, 0x00753, 0x25797, 0x006f2, 0x26137, 0x00696, 0x26ad7, 0x0063f, 0x27477, 0x005ed, 0x27e17, 0x0059f, 0x287b6, 0x00554, 0x29156, + 0x0050e, 0x29af6, 0x004cc, 0x2a497, 0x0048d, 0x2ae35, 0x00451, 0x2b7d6, 0x00418, 0x2c176, 0x003e2, 0x2cb15, 0x003af, 0x2d4b5, 0x0037f, 0x2de55 + }; + + +void CABAC_encoder_estim::write_CABAC_bit(int modelIdx, int bit) +{ + context_model* model = &(*mCtxModels)[modelIdx]; + //printf("[%d] state=%d, bin=%d\n", encBinCnt, model->state,bit); + //encBinCnt++; + + int idx = model->state<<1; + + if (bit==model->MPSbit) { + model->state = next_state_MPS[model->state]; + } + else { + idx++; + if (model->state==0) { model->MPSbit = 1-model->MPSbit; } + model->state = next_state_LPS[model->state]; + } + + mFracBits += entropy_table[idx]; + + //printf("-> %08lx %f\n",entropy_table[idx], entropy_table[idx] / float(1<<15)); +} + + +float CABAC_encoder::RDBits_for_CABAC_bin(int modelIdx, int bit) +{ + context_model* model = &(*mCtxModels)[modelIdx]; + int idx = model->state<<1; + + if (bit!=model->MPSbit) { + idx++; + } + + return entropy_table[idx] / float(1<<15); +} + + +void CABAC_encoder::write_CABAC_EGk(int val, int k) +{ + while (val >= ( 1 << k ) ) { + write_CABAC_bypass(1); + val = val - ( 1 << k ); + k++; + } + + write_CABAC_bypass(0); + + while (k) { + k--; + write_CABAC_bypass((val >> k) & 1); + } +} + + + +void 
CABAC_encoder_estim_constant::write_CABAC_bit(int modelIdx, int bit) +{ + context_model* model = &(*mCtxModels)[modelIdx]; + int idx = model->state<<1; + + if (bit!=model->MPSbit) { + idx++; + } + + mFracBits += entropy_table[idx]; +} + + + +#if 0 +void printtab(int idx,int s) +{ + printf("%d %f %f %f\n", s, + double(entropy_table[idx])/0x8000, + double(entropy_table_orig[idx])/0x8000, + double(entropy_table_f265[idx])/0x8000); +} + + +void plot_tables() +{ + for (int i=-62;i<=0;i++) { + int idx = -i *2; + int s = i; + printtab(idx,s); + } + + for (int i=0;i<=62;i++) { + int idx = 2*i +1; + int s = i; + printtab(idx,s); + } +} +#endif diff --git a/configparam.cc b/configparam.cc new file mode 100644 index 0000000..d944141 --- /dev/null +++ b/configparam.cc @@ -0,0 +1,491 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * Authors: struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "configparam.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifndef RTTI_ENABLED +#error "Need to compile with RTTI enabled." 
+#endif + +static void remove_option(int* argc,char** argv,int idx, int n=1) +{ + for (int i=idx+n;i<*argc;i++) { + argv[i-n] = argv[i]; + } + + *argc-=n; +} + + +bool option_string::processCmdLineArguments(char** argv, int* argc, int idx) +{ + if (argv==NULL) { return false; } + if (idx >= *argc) { return false; } + + value = argv[idx]; + value_set = true; + + remove_option(argc,argv,idx,1); + + return true; +} + + +void option_int::set_range(int mini,int maxi) +{ + have_low_limit =true; + have_high_limit=true; + low_limit =mini; + high_limit=maxi; +} + +std::string option_int::getTypeDescr() const +{ + std::stringstream sstr; + sstr << "(int)"; + + if (have_low_limit || have_high_limit) { sstr << " "; } + if (have_low_limit) { sstr << low_limit << " <= "; } + if (have_low_limit || have_high_limit) { sstr << "x"; } + if (have_high_limit) { sstr << " <= " << high_limit; } + + if (!valid_values_set.empty()) { + sstr << " {"; + bool first=true; + FOR_LOOP(int, v, valid_values_set) { + if (!first) sstr << ","; else first=false; + sstr << v; + } + sstr << "}"; + } + + return sstr.str(); +} + +bool option_int::processCmdLineArguments(char** argv, int* argc, int idx) +{ + if (argv==NULL) { return false; } + if (idx >= *argc) { return false; } + + int v = atoi(argv[idx]); + if (!is_valid(v)) { return false; } + + value = v; + value_set = true; + + remove_option(argc,argv,idx,1); + + return true; +} + +bool option_int::is_valid(int v) const +{ + if (have_low_limit && vhigh_limit) { return false; } + + if (!valid_values_set.empty()) { + auto iter = std::find(valid_values_set.begin(), valid_values_set.end(), v); + if (iter==valid_values_set.end()) { return false; } + } + + return true; +} + +std::string option_int::get_default_string() const +{ + std::stringstream sstr; + sstr << default_value; + return sstr.str(); +} + + +std::string choice_option_base::getTypeDescr() const +{ + std::vector choices = get_choice_names(); + + std::stringstream sstr; + sstr << "{"; + + bool 
first=true; +#ifdef FOR_LOOP_AUTO_SUPPORT + FOR_LOOP(auto, c, choices) { +#else + FOR_LOOP(std::string, c, choices) { +#endif + if (first) { first=false; } + else { sstr << ","; } + + sstr << c; + } + + sstr << "}"; + return sstr.str(); +} + + +bool choice_option_base::processCmdLineArguments(char** argv, int* argc, int idx) +{ + if (argv==NULL) { return false; } + if (idx >= *argc) { return false; } + + std::string value = argv[idx]; + + std::cout << "set " << value << "\n"; + bool success = set_value(value); + std::cout << "success " << success << "\n"; + + remove_option(argc,argv,idx,1); + + return success; +} + + +static char* fill_strings_into_memory(const std::vector& strings_list) +{ + // calculate memory requirement + + int totalStringLengths = 0; +#ifdef FOR_LOOP_AUTO_SUPPORT + FOR_LOOP(auto, str, strings_list) { +#else + FOR_LOOP(std::string, str, strings_list) { +#endif + totalStringLengths += str.length() +1; // +1 for null termination + } + + int numStrings = strings_list.size(); + + int pointersSize = (numStrings+1) * sizeof(const char*); + + char* memory = new char[pointersSize + totalStringLengths]; + + + // copy strings to memory area + + char* stringPtr = memory + (numStrings+1) * sizeof(const char*); + const char** tablePtr = (const char**)memory; + +#ifdef FOR_LOOP_AUTO_SUPPORT + FOR_LOOP(auto, str, strings_list) { +#else + FOR_LOOP(std::string, str, strings_list) { +#endif + *tablePtr++ = stringPtr; + + strcpy(stringPtr, str.c_str()); + stringPtr += str.length()+1; + } + + *tablePtr = NULL; + + return memory; +} + + +const char** choice_option_base::get_choices_string_table() const +{ + if (choice_string_table==NULL) { + choice_string_table = fill_strings_into_memory(get_choice_names()); + } + + return (const char**)choice_string_table; +} + + + +bool config_parameters::parse_command_line_params(int* argc, char** argv, int* first_idx_ptr, + bool ignore_unknown_options) +{ + int first_idx=1; + if (first_idx_ptr) { first_idx = *first_idx_ptr; } + 
+ for (int i=first_idx;i < *argc;i++) { + + if (argv[i][0]=='-') { + // option + + if (argv[i][1]=='-') { + // long option + + bool option_found=false; + + for (int o=0;ohasLongOption() && strcmp(mOptions[o]->getLongOption().c_str(), + argv[i]+2)==0) { + option_found=true; + + printf("FOUND %s\n",argv[i]); + + bool success = mOptions[o]->processCmdLineArguments(argv,argc, i+1); + if (!success) { + if (first_idx_ptr) { *first_idx_ptr = i; } + return false; + } + + remove_option(argc,argv,i); + i--; + + break; + } + } + + if (option_found == false && !ignore_unknown_options) { + return false; + } + } + else { + // short option + + bool is_single_option = (argv[i][1] != 0 && argv[i][2]==0); + bool do_remove_option = true; + + for (int n=1; argv[i][n]; n++) { + char option = argv[i][n]; + + bool option_found=false; + + for (int o=0;ogetShortOption() == option) { + option_found=true; + + bool success; + if (is_single_option) { + success = mOptions[o]->processCmdLineArguments(argv,argc, i+1); + } + else { + success = mOptions[o]->processCmdLineArguments(NULL,NULL, 0); + } + + if (!success) { + if (first_idx_ptr) { *first_idx_ptr = i; } + return false; + } + + break; + } + } + + if (!option_found) { + if (!ignore_unknown_options) { + fprintf(stderr, "unknown option -%c\n",option); + return false; + } + else { + do_remove_option=false; + } + } + + } // all short options + + if (do_remove_option) { + remove_option(argc,argv,i); + i--; + } + } // is short option + } // is option + } // all command line arguments + + return true; +} + + +void config_parameters::print_params() const +{ + for (int i=0;ihasShortOption()) { + sstr << '-' << o->getShortOption(); + } else { + sstr << " "; + } + + if (o->hasShortOption() && o->hasLongOption()) { + sstr << ", "; + } else { + sstr << " "; + } + + if (o->hasLongOption()) { + sstr << "--" << std::setw(12) << std::left << o->getLongOption(); + } else { + sstr << " "; + } + + sstr << " "; + sstr << o->getTypeDescr(); + + if 
(o->has_default()) { + sstr << ", default=" << o->get_default_string(); + } + + if (o->has_description()) { + sstr << " : " << o->get_description(); + } + + sstr << "\n"; + + std::cerr << sstr.str(); + } +} + + +void config_parameters::add_option(option_base* o) +{ + mOptions.push_back(o); + delete[] param_string_table; // delete old table, since we got a new parameter + param_string_table = NULL; +} + + +std::vector config_parameters::get_parameter_IDs() const +{ + std::vector ids; + +#ifdef FOR_LOOP_AUTO_SUPPORT + FOR_LOOP(auto, option, mOptions) { +#else + FOR_LOOP(option_base*, option, mOptions) { +#endif + ids.push_back(option->get_name()); + } + + return ids; +} + + +enum en265_parameter_type config_parameters::get_parameter_type(const char* param) const +{ + option_base* option = find_option(param); + assert(option); + + if (dynamic_cast (option)) { return en265_parameter_int; } + if (dynamic_cast (option)) { return en265_parameter_bool; } + if (dynamic_cast(option)) { return en265_parameter_string; } + if (dynamic_cast(option)) { return en265_parameter_choice; } + + assert(false); + return en265_parameter_bool; +} + + +std::vector config_parameters::get_parameter_choices(const char* param) const +{ + option_base* option = find_option(param); + assert(option); + + choice_option_base* o = dynamic_cast(option); + assert(o); + + return o->get_choice_names(); +} + + +option_base* config_parameters::find_option(const char* param) const +{ +#ifdef FOR_LOOP_AUTO_SUPPORT + FOR_LOOP(auto, o, mOptions) { +#else + FOR_LOOP(option_base*, o, mOptions) { +#endif + if (strcmp(o->get_name().c_str(), param)==0) { return o; } + } + + return NULL; +} + + +bool config_parameters::set_bool(const char* param, bool value) +{ + option_base* option = find_option(param); + assert(option); + + option_bool* o = dynamic_cast(option); + assert(o); + + return o->set(value); +} + +bool config_parameters::set_int(const char* param, int value) +{ + option_base* option = find_option(param); + 
assert(option); + + option_int* o = dynamic_cast(option); + assert(o); + + return o->set(value); +} + +bool config_parameters::set_string(const char* param, const char* value) +{ + option_base* option = find_option(param); + assert(option); + + option_string* o = dynamic_cast(option); + assert(o); + + return o->set(value); +} + +bool config_parameters::set_choice(const char* param, const char* value) +{ + option_base* option = find_option(param); + assert(option); + + choice_option_base* o = dynamic_cast(option); + assert(o); + + return o->set(value); +} + + + +const char** config_parameters::get_parameter_choices_table(const char* param) const +{ + option_base* option = find_option(param); + assert(option); + + choice_option_base* o = dynamic_cast(option); + assert(o); + + return o->get_choices_string_table(); +} + +const char** config_parameters::get_parameter_string_table() const +{ + if (param_string_table==NULL) { + param_string_table = fill_strings_into_memory(get_parameter_IDs()); + } + + return (const char**)param_string_table; +} diff --git a/contextmodel.cc b/contextmodel.cc new file mode 100644 index 0000000..ec43228 --- /dev/null +++ b/contextmodel.cc @@ -0,0 +1,347 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "slice.h" +#include +#include +#include + +bool D = false; + +context_model_table::context_model_table() + : model(NULL), refcnt(NULL) +{ +} + + +context_model_table::context_model_table(const context_model_table& src) +{ + if (D) printf("%p c'tor = %p\n",this,&src); + + if (src.refcnt) { + (*(src.refcnt))++; + } + + refcnt = src.refcnt; + model = src.model; +} + + +context_model_table::~context_model_table() +{ + if (D) printf("%p destructor\n",this); + + if (refcnt) { + (*refcnt)--; + if (*refcnt==0) { + if (D) printf("mfree %p\n",model); + delete[] model; + delete refcnt; + } + } +} + + +void context_model_table::init(int initType, int QPY) +{ + if (D) printf("%p init\n",this); + + decouple_or_alloc_with_empty_data(); + + initialize_CABAC_models(model, initType, QPY); +} + + +void context_model_table::release() +{ + if (D) printf("%p release %p\n",this,refcnt); + + if (!refcnt) { return; } + + // if (*refcnt == 1) { return; } <- keep memory for later, but does not work when we believe that we freed the memory and nulled all references + + (*refcnt)--; + if (*refcnt==0) { + delete[] model; + delete refcnt; + } + + model = nullptr; + refcnt= nullptr; +} + + +void context_model_table::decouple() +{ + if (D) printf("%p decouple (%p)\n",this,refcnt); + + assert(refcnt); // not necessarily so, but we never use it on an unitialized object + + if (*refcnt > 1) { + (*refcnt)--; + + context_model* oldModel = model; + + model = new context_model[CONTEXT_MODEL_TABLE_LENGTH]; + refcnt= new int; + *refcnt=1; + + memcpy(model,oldModel,sizeof(context_model)*CONTEXT_MODEL_TABLE_LENGTH); + } +} + + +context_model_table context_model_table::transfer() +{ + context_model_table newtable; + newtable.model = model; + newtable.refcnt= refcnt; + + model =nullptr; + refcnt=nullptr; + + return newtable; +} + + +context_model_table& context_model_table::operator=(const context_model_table& src) +{ + if (D) printf("%p assign = %p\n",this,&src); + + // assert(src.refcnt); // 
not necessarily so, but we never use it on an unitialized object + + if (!src.refcnt) { + release(); + return *this; + } + + (*(src.refcnt))++; + + release(); + + model = src.model; + refcnt= src.refcnt; + + return *this; +} + + +bool context_model_table::operator==(const context_model_table& b) const +{ + if (b.model == model) return true; + if (b.model == nullptr || model == nullptr) return false; + + for (int i=0;i1); + (*refcnt)--; + } + + if (D) printf("%p (alloc)\n",this); + + model = new context_model[CONTEXT_MODEL_TABLE_LENGTH]; + refcnt= new int; + *refcnt=1; +} + + + + + + +static void set_initValue(int SliceQPY, + context_model* model, int initValue, int nContexts) +{ + int slopeIdx = initValue >> 4; + int intersecIdx = initValue & 0xF; + int m = slopeIdx*5 - 45; + int n = (intersecIdx<<3) - 16; + int preCtxState = Clip3(1,126, ((m*Clip3(0,51, SliceQPY))>>4)+n); + + // logtrace(LogSlice,"QP=%d slopeIdx=%d intersecIdx=%d m=%d n=%d\n",SliceQPY,slopeIdx,intersecIdx,m,n); + + for (int i=0;i 0) { + init_context(QPY, cm+CONTEXT_MODEL_CU_SKIP_FLAG, initValue_cu_skip_flag[initType-1], 3); + init_context(QPY, cm+CONTEXT_MODEL_PRED_MODE_FLAG, &initValue_pred_mode_flag[initType-1], 1); + init_context(QPY, cm+CONTEXT_MODEL_MERGE_FLAG, &initValue_merge_flag[initType-1],1); + init_context(QPY, cm+CONTEXT_MODEL_MERGE_IDX, &initValue_merge_idx[initType-1], 1); + init_context(QPY, cm+CONTEXT_MODEL_INTER_PRED_IDC, initValue_inter_pred_idc, 5); + init_context(QPY, cm+CONTEXT_MODEL_REF_IDX_LX, initValue_ref_idx_lX, 2); + init_context(QPY, cm+CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG, &initValue_abs_mvd_greater01_flag[initType == 1 ? 
0 : 2], 2); + init_context(QPY, cm+CONTEXT_MODEL_MVP_LX_FLAG, initValue_mvp_lx_flag, 1); + init_context(QPY, cm+CONTEXT_MODEL_RQT_ROOT_CBF, initValue_rqt_root_cbf, 1); + + init_context_const(QPY, cm+CONTEXT_MODEL_RDPCM_FLAG, 139, 2); + init_context_const(QPY, cm+CONTEXT_MODEL_RDPCM_DIR, 139, 2); + } + + init_context(QPY, cm+CONTEXT_MODEL_SPLIT_CU_FLAG, initValue_split_cu_flag[initType], 3); + init_context(QPY, cm+CONTEXT_MODEL_PART_MODE, &initValue_part_mode[(initType!=2 ? initType : 5)], 4); + init_context(QPY, cm+CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG, &initValue_prev_intra_luma_pred_flag[initType], 1); + init_context(QPY, cm+CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE, &initValue_intra_chroma_pred_mode[initType], 1); + init_context(QPY, cm+CONTEXT_MODEL_CBF_LUMA, &initValue_cbf_luma[initType == 0 ? 0 : 2], 2); + init_context(QPY, cm+CONTEXT_MODEL_CBF_CHROMA, &initValue_cbf_chroma[initType * 4], 4); + init_context(QPY, cm+CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG, &initValue_split_transform_flag[initType * 3], 3); + init_context(QPY, cm+CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX, &initValue_last_significant_coefficient_prefix[initType * 18], 18); + init_context(QPY, cm+CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX, &initValue_last_significant_coefficient_prefix[initType * 18], 18); + init_context(QPY, cm+CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG, &initValue_coded_sub_block_flag[initType * 4], 4); + init_context(QPY, cm+CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG, initValue_significant_coeff_flag[initType], 42); + init_context(QPY, cm+CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG+42, initValue_significant_coeff_flag_skipmode[initType], 2); + + init_context(QPY, cm+CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG, &initValue_coeff_abs_level_greater1_flag[initType * 24], 24); + init_context(QPY, cm+CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG, &initValue_coeff_abs_level_greater2_flag[initType * 6], 6); + init_context(QPY, cm+CONTEXT_MODEL_SAO_MERGE_FLAG, 
&initValue_sao_merge_leftUp_flag[initType], 1); + init_context(QPY, cm+CONTEXT_MODEL_SAO_TYPE_IDX, &initValue_sao_type_idx_lumaChroma_flag[initType], 1); + init_context(QPY, cm+CONTEXT_MODEL_CU_QP_DELTA_ABS, initValue_cu_qp_delta_abs, 2); + init_context(QPY, cm+CONTEXT_MODEL_TRANSFORM_SKIP_FLAG, initValue_transform_skip_flag, 2); + init_context(QPY, cm+CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG, &initValue_cu_transquant_bypass_flag[initType], 1); + + init_context_const(QPY, cm+CONTEXT_MODEL_LOG2_RES_SCALE_ABS_PLUS1, 154, 8); + init_context_const(QPY, cm+CONTEXT_MODEL_RES_SCALE_SIGN_FLAG, 154, 2); + init_context_const(QPY, cm+CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_FLAG, 154, 1); + init_context_const(QPY, cm+CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_IDX, 154, 1); +} diff --git a/de265.cc b/de265.cc new file mode 100644 index 0000000..75dd0a8 --- /dev/null +++ b/de265.cc @@ -0,0 +1,711 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#define DEBUG_INSERT_STREAM_ERRORS 0 + + +#include "de265.h" +#include "decctx.h" +#include "util.h" +#include "scan.h" +#include "image.h" +#include "sei.h" + +#include +#include +#include +#include + + +// TODO: should be in some vps.c related header +de265_error read_vps(decoder_context* ctx, bitreader* reader, video_parameter_set* vps); + +extern "C" { +LIBDE265_API const char *de265_get_version(void) +{ + return (LIBDE265_VERSION); +} + +LIBDE265_API uint32_t de265_get_version_number(void) +{ + return (LIBDE265_NUMERIC_VERSION); +} + +LIBDE265_API int de265_get_version_number_major(void) +{ + return ((LIBDE265_NUMERIC_VERSION)>>24) & 0xFF; +} + +LIBDE265_API int de265_get_version_number_minor(void) +{ + return ((LIBDE265_NUMERIC_VERSION)>>16) & 0xFF; +} + +LIBDE265_API int de265_get_version_number_maintenance(void) +{ + return ((LIBDE265_NUMERIC_VERSION)>>8) & 0xFF; +} + + +LIBDE265_API const char* de265_get_error_text(de265_error err) +{ + switch (err) { + case DE265_OK: return "no error"; + case DE265_ERROR_NO_SUCH_FILE: return "no such file"; + //case DE265_ERROR_NO_STARTCODE: return "no startcode found"; + //case DE265_ERROR_EOF: return "end of file"; + case DE265_ERROR_COEFFICIENT_OUT_OF_IMAGE_BOUNDS: return "coefficient out of image bounds"; + case DE265_ERROR_CHECKSUM_MISMATCH: return "image checksum mismatch"; + case DE265_ERROR_CTB_OUTSIDE_IMAGE_AREA: return "CTB outside of image area"; + case DE265_ERROR_OUT_OF_MEMORY: return "out of memory"; + case DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE: return "coded parameter out of range"; + case DE265_ERROR_IMAGE_BUFFER_FULL: return "DPB/output queue full"; + case DE265_ERROR_CANNOT_START_THREADPOOL: return "cannot start decoding threads"; + case DE265_ERROR_LIBRARY_INITIALIZATION_FAILED: return "global library initialization failed"; + case DE265_ERROR_LIBRARY_NOT_INITIALIZED: return "cannot free library data (not initialized"; + + //case DE265_ERROR_MAX_THREAD_CONTEXTS_EXCEEDED: + // return "internal 
error: maximum number of thread contexts exceeded"; + //case DE265_ERROR_MAX_NUMBER_OF_SLICES_EXCEEDED: + // return "internal error: maximum number of slices exceeded"; + case DE265_ERROR_NOT_IMPLEMENTED_YET: + return "unimplemented decoder feature"; + //case DE265_ERROR_SCALING_LIST_NOT_IMPLEMENTED: + //return "scaling list not implemented"; + + case DE265_ERROR_WAITING_FOR_INPUT_DATA: + return "no more input data, decoder stalled"; + case DE265_ERROR_CANNOT_PROCESS_SEI: + return "SEI data cannot be processed"; + case DE265_ERROR_PARAMETER_PARSING: + return "command-line parameter error"; + case DE265_ERROR_NO_INITIAL_SLICE_HEADER: + return "first slice missing, cannot decode dependent slice"; + case DE265_ERROR_PREMATURE_END_OF_SLICE: + return "premature end of slice data"; + case DE265_ERROR_UNSPECIFIED_DECODING_ERROR: + return "unspecified decoding error"; + + case DE265_WARNING_NO_WPP_CANNOT_USE_MULTITHREADING: + return "Cannot run decoder multi-threaded because stream does not support WPP"; + case DE265_WARNING_WARNING_BUFFER_FULL: + return "Too many warnings queued"; + case DE265_WARNING_PREMATURE_END_OF_SLICE_SEGMENT: + return "Premature end of slice segment"; + case DE265_WARNING_INCORRECT_ENTRY_POINT_OFFSET: + return "Incorrect entry-point offsets"; + case DE265_WARNING_CTB_OUTSIDE_IMAGE_AREA: + return "CTB outside of image area (concealing stream error...)"; + case DE265_WARNING_SPS_HEADER_INVALID: + return "sps header invalid"; + case DE265_WARNING_PPS_HEADER_INVALID: + return "pps header invalid"; + case DE265_WARNING_SLICEHEADER_INVALID: + return "slice header invalid"; + case DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING: + return "impossible motion vector scaling"; + case DE265_WARNING_NONEXISTING_PPS_REFERENCED: + return "non-existing PPS referenced"; + case DE265_WARNING_NONEXISTING_SPS_REFERENCED: + return "non-existing SPS referenced"; + case DE265_WARNING_BOTH_PREDFLAGS_ZERO: + return "both predFlags[] are zero in MC"; + case 
DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED: + return "non-existing reference picture accessed"; + case DE265_WARNING_NUMMVP_NOT_EQUAL_TO_NUMMVQ: + return "numMV_P != numMV_Q in deblocking"; + case DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE: + return "number of short-term ref-pic-sets out of range"; + case DE265_WARNING_SHORT_TERM_REF_PIC_SET_OUT_OF_RANGE: + return "short-term ref-pic-set index out of range"; + case DE265_WARNING_FAULTY_REFERENCE_PICTURE_LIST: + return "faulty reference picture list"; + case DE265_WARNING_EOSS_BIT_NOT_SET: + return "end_of_sub_stream_one_bit not set to 1 when it should be"; + case DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED: + return "maximum number of reference pictures exceeded"; + case DE265_WARNING_INVALID_CHROMA_FORMAT: + return "invalid chroma format in SPS header"; + case DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID: + return "slice segment address invalid"; + case DE265_WARNING_DEPENDENT_SLICE_WITH_ADDRESS_ZERO: + return "dependent slice with address 0"; + case DE265_WARNING_NUMBER_OF_THREADS_LIMITED_TO_MAXIMUM: + return "number of threads limited to maximum amount"; + case DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER: + return "non-existing long-term reference candidate specified in slice header"; + case DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY: + return "cannot apply SAO because we ran out of memory"; + case DE265_WARNING_SPS_MISSING_CANNOT_DECODE_SEI: + return "SPS header missing, cannot decode SEI"; + case DE265_WARNING_COLLOCATED_MOTION_VECTOR_OUTSIDE_IMAGE_AREA: + return "collocated motion-vector is outside image area"; + + default: return "unknown error"; + } +} + +LIBDE265_API int de265_isOK(de265_error err) +{ + return err == DE265_OK || err >= 1000; +} + + + +static int de265_init_count; + +static std::mutex de265_init_mutex; + + +LIBDE265_API de265_error de265_init() +{ + std::lock_guard lock(de265_init_mutex); + + de265_init_count++; + + if (de265_init_count > 1) { + // we 
are not the first -> already initialized + + return DE265_OK; + } + + + // do initializations + + init_scan_orders(); + + if (!alloc_and_init_significant_coeff_ctxIdx_lookupTable()) { + de265_init_count--; + return DE265_ERROR_LIBRARY_INITIALIZATION_FAILED; + } + + return DE265_OK; +} + +LIBDE265_API de265_error de265_free() +{ + std::lock_guard lock(de265_init_mutex); + + if (de265_init_count<=0) { + return DE265_ERROR_LIBRARY_NOT_INITIALIZED; + } + + de265_init_count--; + + if (de265_init_count==0) { + free_significant_coeff_ctxIdx_lookupTable(); + } + + return DE265_OK; +} + + +LIBDE265_API de265_decoder_context* de265_new_decoder() +{ + de265_error init_err = de265_init(); + if (init_err != DE265_OK) { + return NULL; + } + + decoder_context* ctx = new decoder_context; + if (!ctx) { + de265_free(); + return NULL; + } + + return (de265_decoder_context*)ctx; +} + + +LIBDE265_API de265_error de265_free_decoder(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + ctx->stop_thread_pool(); + + delete ctx; + + return de265_free(); +} + + +LIBDE265_API de265_error de265_start_worker_threads(de265_decoder_context* de265ctx, int number_of_threads) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + if (number_of_threads > MAX_THREADS) { + number_of_threads = MAX_THREADS; + } + + if (number_of_threads>0) { + de265_error err = ctx->start_thread_pool(number_of_threads); + if (de265_isOK(err)) { + err = DE265_OK; + } + return err; + } + else { + return DE265_OK; + } +} + + +#ifndef LIBDE265_DISABLE_DEPRECATED +LIBDE265_API de265_error de265_decode_data(de265_decoder_context* de265ctx, + const void* data8, int len) +{ + //decoder_context* ctx = (decoder_context*)de265ctx; + de265_error err; + if (len > 0) { + err = de265_push_data(de265ctx, data8, len, 0, NULL); + } else { + err = de265_flush_data(de265ctx); + } + if (err != DE265_OK) { + return err; + } + + int more = 0; + do { + err = de265_decode(de265ctx, &more); + if (err != 
DE265_OK) { + more = 0; + } + + switch (err) { + case DE265_ERROR_WAITING_FOR_INPUT_DATA: + // ignore error (didn't exist in 0.4 and before) + err = DE265_OK; + break; + default: + break; + } + } while (more); + return err; +} +#endif + +static void dumpdata(const void* data, int len) +{ + for (int i=0;inal_parser.push_data(data,len,pts,user_data); +} + + +LIBDE265_API de265_error de265_push_NAL(de265_decoder_context* de265ctx, + const void* data8, int len, + de265_PTS pts, void* user_data) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + uint8_t* data = (uint8_t*)data8; + + //printf("push NAL (size %d)\n",len); + //dumpdata(data8,16); + + return ctx->nal_parser.push_NAL(data,len,pts,user_data); +} + + +LIBDE265_API de265_error de265_decode(de265_decoder_context* de265ctx, int* more) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + return ctx->decode(more); +} + + +LIBDE265_API void de265_push_end_of_NAL(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + ctx->nal_parser.flush_data(); +} + + +LIBDE265_API void de265_push_end_of_frame(de265_decoder_context* de265ctx) +{ + de265_push_end_of_NAL(de265ctx); + + decoder_context* ctx = (decoder_context*)de265ctx; + ctx->nal_parser.mark_end_of_frame(); +} + + +LIBDE265_API de265_error de265_flush_data(de265_decoder_context* de265ctx) +{ + de265_push_end_of_NAL(de265ctx); + + decoder_context* ctx = (decoder_context*)de265ctx; + + ctx->nal_parser.flush_data(); + ctx->nal_parser.mark_end_of_stream(); + + return DE265_OK; +} + + +LIBDE265_API void de265_reset(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + //printf("--- reset ---\n"); + + ctx->reset(); +} + + +LIBDE265_API const struct de265_image* de265_get_next_picture(de265_decoder_context* de265ctx) +{ + const struct de265_image* img = de265_peek_next_picture(de265ctx); + if (img) { + de265_release_next_picture(de265ctx); + } + + return img; +} + + +LIBDE265_API const 
struct de265_image* de265_peek_next_picture(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + if (ctx->num_pictures_in_output_queue()>0) { + de265_image* img = ctx->get_next_picture_in_output_queue(); + return img; + } + else { + return NULL; + } +} + + +LIBDE265_API void de265_release_next_picture(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + // no active output picture -> ignore release request + + if (ctx->num_pictures_in_output_queue()==0) { return; } + + de265_image* next_image = ctx->get_next_picture_in_output_queue(); + + loginfo(LogDPB, "release DPB with POC=%d\n",next_image->PicOrderCntVal); + + next_image->PicOutputFlag = false; + + // TODO: actually, we want to release it here, but we cannot without breaking API + // compatibility, because get_next_picture calls this immediately. Hence, we release + // images while scanning for available slots in the DPB. + // if (next_image->can_be_released()) { next_image->release(); } + + // pop output queue + + ctx->pop_next_picture_in_output_queue(); +} + + + +LIBDE265_API int de265_get_highest_TID(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + return ctx->get_highest_TID(); +} + +LIBDE265_API int de265_get_current_TID(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + return ctx->get_current_TID(); +} + +LIBDE265_API void de265_set_limit_TID(de265_decoder_context* de265ctx,int max_tid) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + ctx->set_limit_TID(max_tid); +} + +LIBDE265_API void de265_set_framerate_ratio(de265_decoder_context* de265ctx,int percent) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + ctx->set_framerate_ratio(percent); +} + +LIBDE265_API int de265_change_framerate(de265_decoder_context* de265ctx,int more) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + return ctx->change_framerate(more); +} + + 
+LIBDE265_API de265_error de265_get_warning(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + return ctx->get_warning(); +} + +LIBDE265_API void de265_set_parameter_bool(de265_decoder_context* de265ctx, enum de265_param param, int value) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + switch (param) + { + case DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH: + ctx->param_sei_check_hash = !!value; + break; + + case DE265_DECODER_PARAM_SUPPRESS_FAULTY_PICTURES: + ctx->param_suppress_faulty_pictures = !!value; + break; + + case DE265_DECODER_PARAM_DISABLE_DEBLOCKING: + ctx->param_disable_deblocking = !!value; + break; + + case DE265_DECODER_PARAM_DISABLE_SAO: + ctx->param_disable_sao = !!value; + break; + + /* + case DE265_DECODER_PARAM_DISABLE_MC_RESIDUAL_IDCT: + ctx->param_disable_mc_residual_idct = !!value; + break; + + case DE265_DECODER_PARAM_DISABLE_INTRA_RESIDUAL_IDCT: + ctx->param_disable_intra_residual_idct = !!value; + break; + */ + + default: + assert(false); + break; + } +} + + +LIBDE265_API void de265_set_parameter_int(de265_decoder_context* de265ctx, enum de265_param param, int value) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + switch (param) + { + case DE265_DECODER_PARAM_DUMP_SPS_HEADERS: + ctx->param_sps_headers_fd = value; + break; + + case DE265_DECODER_PARAM_DUMP_VPS_HEADERS: + ctx->param_vps_headers_fd = value; + break; + + case DE265_DECODER_PARAM_DUMP_PPS_HEADERS: + ctx->param_pps_headers_fd = value; + break; + + case DE265_DECODER_PARAM_DUMP_SLICE_HEADERS: + ctx->param_slice_headers_fd = value; + break; + + case DE265_DECODER_PARAM_ACCELERATION_CODE: + ctx->set_acceleration_functions((enum de265_acceleration)value); + break; + + default: + assert(false); + break; + } +} + + + + +LIBDE265_API int de265_get_parameter_bool(de265_decoder_context* de265ctx, enum de265_param param) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + switch (param) + { + case 
DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH: + return ctx->param_sei_check_hash; + + case DE265_DECODER_PARAM_SUPPRESS_FAULTY_PICTURES: + return ctx->param_suppress_faulty_pictures; + + case DE265_DECODER_PARAM_DISABLE_DEBLOCKING: + return ctx->param_disable_deblocking; + + case DE265_DECODER_PARAM_DISABLE_SAO: + return ctx->param_disable_sao; + + /* + case DE265_DECODER_PARAM_DISABLE_MC_RESIDUAL_IDCT: + return ctx->param_disable_mc_residual_idct; + + case DE265_DECODER_PARAM_DISABLE_INTRA_RESIDUAL_IDCT: + return ctx->param_disable_intra_residual_idct; + */ + + default: + assert(false); + return false; + } +} + + +LIBDE265_API int de265_get_number_of_input_bytes_pending(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + return ctx->nal_parser.bytes_in_input_queue(); +} + + +LIBDE265_API int de265_get_number_of_NAL_units_pending(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + return ctx->nal_parser.number_of_NAL_units_pending(); +} + + +LIBDE265_API int de265_get_image_width(const struct de265_image* img,int channel) +{ + switch (channel) { + case 0: + return img->width_confwin; + case 1: + case 2: + return img->chroma_width_confwin; + default: + return 0; + } +} + +LIBDE265_API int de265_get_image_height(const struct de265_image* img,int channel) +{ + switch (channel) { + case 0: + return img->height_confwin; + case 1: + case 2: + return img->chroma_height_confwin; + default: + return 0; + } +} + +LIBDE265_API int de265_get_bits_per_pixel(const struct de265_image* img,int channel) +{ + switch (channel) { + case 0: + return img->get_sps().BitDepth_Y; + case 1: + case 2: + return img->get_sps().BitDepth_C; + default: + return 0; + } +} + +LIBDE265_API enum de265_chroma de265_get_chroma_format(const struct de265_image* img) +{ + return img->get_chroma_format(); +} + +LIBDE265_API const uint8_t* de265_get_image_plane(const de265_image* img, int channel, int* stride) +{ + 
assert(channel>=0 && channel <= 2); + + uint8_t* data = img->pixels_confwin[channel]; + + if (stride) *stride = img->get_image_stride(channel) * ((de265_get_bits_per_pixel(img, channel)+7) / 8); + + return data; +} + +LIBDE265_API void *de265_get_image_plane_user_data(const struct de265_image* img, int channel) +{ + assert(channel>=0 && channel <= 2); + + return img->plane_user_data[channel]; +} + +LIBDE265_API void de265_set_image_plane(de265_image* img, int cIdx, void* mem, int stride, void *userdata) +{ + // The internal "stride" is the number of pixels per line. + stride = stride / ((de265_get_bits_per_pixel(img, cIdx)+7) / 8); + img->set_image_plane(cIdx, (uint8_t*)mem, stride, userdata); +} + +LIBDE265_API void de265_set_image_allocation_functions(de265_decoder_context* de265ctx, + de265_image_allocation* allocfunc, + void* userdata) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + ctx->set_image_allocation_functions(allocfunc, userdata); +} + +LIBDE265_API const struct de265_image_allocation *de265_get_default_image_allocation_functions(void) +{ + return &de265_image::default_image_allocation; +} + +LIBDE265_API de265_PTS de265_get_image_PTS(const struct de265_image* img) +{ + return img->pts; +} + +LIBDE265_API void* de265_get_image_user_data(const struct de265_image* img) +{ + return img->user_data; +} + +LIBDE265_API void de265_set_image_user_data(struct de265_image* img, void *user_data) +{ + img->user_data = user_data; +} + +LIBDE265_API void de265_get_image_NAL_header(const struct de265_image* img, + int* nal_unit_type, + const char** nal_unit_name, + int* nuh_layer_id, + int* nuh_temporal_id) +{ + if (nal_unit_type) *nal_unit_type = img->nal_hdr.nal_unit_type; + if (nal_unit_name) *nal_unit_name = get_NAL_name(img->nal_hdr.nal_unit_type); + if (nuh_layer_id) *nuh_layer_id = img->nal_hdr.nuh_layer_id; + if (nuh_temporal_id) *nuh_temporal_id = img->nal_hdr.nuh_temporal_id; +} +} diff --git a/deblock.cc b/deblock.cc new file mode 100644 index 
0000000..f64cd8e --- /dev/null +++ b/deblock.cc @@ -0,0 +1,1058 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "deblock.h" +#include "util.h" +#include "transform.h" +#include "de265.h" + +#include + + + +// 8.7.2.1 for both EDGE_HOR and EDGE_VER at the same time +void markTransformBlockBoundary(de265_image* img, int x0,int y0, + int log2TrafoSize,int trafoDepth, + int filterLeftCbEdge, int filterTopCbEdge) +{ + logtrace(LogDeblock,"markTransformBlockBoundary(%d,%d, %d,%d, %d,%d)\n",x0,y0, + log2TrafoSize,trafoDepth, filterLeftCbEdge,filterTopCbEdge); + + int split_transform = img->get_split_transform_flag(x0,y0,trafoDepth); + if (split_transform) { + int x1 = x0 + ((1<>1); + int y1 = y0 + ((1<>1); + + markTransformBlockBoundary(img,x0,y0,log2TrafoSize-1,trafoDepth+1, filterLeftCbEdge, filterTopCbEdge); + markTransformBlockBoundary(img,x1,y0,log2TrafoSize-1,trafoDepth+1, DEBLOCK_FLAG_VERTI, filterTopCbEdge); + markTransformBlockBoundary(img,x0,y1,log2TrafoSize-1,trafoDepth+1, filterLeftCbEdge, DEBLOCK_FLAG_HORIZ); + markTransformBlockBoundary(img,x1,y1,log2TrafoSize-1,trafoDepth+1, DEBLOCK_FLAG_VERTI, DEBLOCK_FLAG_HORIZ); + } + else { + // VER + + for (int k=0;k<(1<set_deblk_flags(x0,y0+k, filterLeftCbEdge); + } + + // HOR + + for (int 
k=0;k<(1<set_deblk_flags(x0+k,y0, filterTopCbEdge); + } + } +} + + + +// 8.7.2.2 for both EDGE_HOR and EDGE_VER at the same time +void markPredictionBlockBoundary(de265_image* img, int x0,int y0, + int log2CbSize, + int filterLeftCbEdge, int filterTopCbEdge) +{ + logtrace(LogDeblock,"markPredictionBlockBoundary(%d,%d, %d, %d,%d)\n",x0,y0, + log2CbSize, filterLeftCbEdge,filterTopCbEdge); + + enum PartMode partMode = img->get_PartMode(x0,y0); + + int cbSize = 1<set_deblk_flags(x0+cbSize2,y0+k, DEBLOCK_PB_EDGE_VERTI); + img->set_deblk_flags(x0+k,y0+cbSize2, DEBLOCK_PB_EDGE_HORIZ); + } + break; + + case PART_Nx2N: + for (int k=0;kset_deblk_flags(x0+cbSize2,y0+k, DEBLOCK_PB_EDGE_VERTI); + } + break; + + case PART_2NxN: + for (int k=0;kset_deblk_flags(x0+k,y0+cbSize2, DEBLOCK_PB_EDGE_HORIZ); + } + break; + + case PART_nLx2N: + for (int k=0;kset_deblk_flags(x0+cbSize4,y0+k, DEBLOCK_PB_EDGE_VERTI); + } + break; + + case PART_nRx2N: + for (int k=0;kset_deblk_flags(x0+cbSize2+cbSize4,y0+k, DEBLOCK_PB_EDGE_VERTI); + } + break; + + case PART_2NxnU: + for (int k=0;kset_deblk_flags(x0+k,y0+cbSize4, DEBLOCK_PB_EDGE_HORIZ); + } + break; + + case PART_2NxnD: + for (int k=0;kset_deblk_flags(x0+k,y0+cbSize2+cbSize4, DEBLOCK_PB_EDGE_HORIZ); + } + break; + + case PART_2Nx2N: + // NOP + break; + } +} + + +bool derive_edgeFlags_CTBRow(de265_image* img, int ctby) +{ + const seq_parameter_set& sps = img->get_sps(); + const pic_parameter_set& pps = img->get_pps(); + + const int minCbSize = sps.MinCbSizeY; + bool deblocking_enabled=false; // whether deblocking is enabled in some part of the image + + int ctb_mask = (1<> sps.Log2MinCbSizeY; + int cb_y_end = ((ctby+1) << sps.Log2CtbSizeY) >> sps.Log2MinCbSizeY; + + cb_y_end = std::min(cb_y_end, sps.PicHeightInMinCbsY); + + for (int cb_y=cb_y_start;cb_yget_sps().PicWidthInMinCbsY;cb_x++) + { + int log2CbSize = img->get_log2CbSize_cbUnits(cb_x,cb_y); + if (log2CbSize==0) { + continue; + } + + // we are now at the top corner of a CB + + int x0 = 
cb_x * minCbSize; + int y0 = cb_y * minCbSize; + + int x0ctb = x0 >> ctbshift; + int y0ctb = y0 >> ctbshift; + + // check for corrupted streams + if (img->is_SliceHeader_available(x0,y0)==false) { + return false; + } + + // check whether we should filter this slice + + slice_segment_header* shdr = img->get_SliceHeader(x0,y0); + + // check whether to filter left and top edge + + uint8_t filterLeftCbEdge = DEBLOCK_FLAG_VERTI; + uint8_t filterTopCbEdge = DEBLOCK_FLAG_HORIZ; + if (x0 == 0) filterLeftCbEdge = 0; + if (y0 == 0) filterTopCbEdge = 0; + + // check for slice and tile boundaries (8.7.2, step 2 in both processes) + + if (x0 && ((x0 & ctb_mask) == 0)) { // left edge at CTB boundary + if (shdr->slice_loop_filter_across_slices_enabled_flag == 0 && + img->is_SliceHeader_available(x0-1,y0) && // for corrupted streams + shdr->SliceAddrRS != img->get_SliceHeader(x0-1,y0)->SliceAddrRS) + { + filterLeftCbEdge = 0; + } + else if (pps.loop_filter_across_tiles_enabled_flag == 0 && + pps.TileIdRS[ x0ctb +y0ctb*picWidthInCtbs] != + pps.TileIdRS[((x0-1)>>ctbshift)+y0ctb*picWidthInCtbs]) { + filterLeftCbEdge = 0; + } + } + + if (y0 && ((y0 & ctb_mask) == 0)) { // top edge at CTB boundary + if (shdr->slice_loop_filter_across_slices_enabled_flag == 0 && + img->is_SliceHeader_available(x0,y0-1) && // for corrupted streams + shdr->SliceAddrRS != img->get_SliceHeader(x0,y0-1)->SliceAddrRS) + { + filterTopCbEdge = 0; + } + else if (pps.loop_filter_across_tiles_enabled_flag == 0 && + pps.TileIdRS[x0ctb+ y0ctb *picWidthInCtbs] != + pps.TileIdRS[x0ctb+((y0-1)>>ctbshift)*picWidthInCtbs]) { + filterTopCbEdge = 0; + } + } + + + // mark edges + + if (shdr->slice_deblocking_filter_disabled_flag==0) { + deblocking_enabled=true; + + markTransformBlockBoundary(img, x0,y0, log2CbSize,0, + filterLeftCbEdge, filterTopCbEdge); + + markPredictionBlockBoundary(img, x0,y0, log2CbSize, + filterLeftCbEdge, filterTopCbEdge); + } + } + + return deblocking_enabled; +} + + +bool 
derive_edgeFlags(de265_image* img) +{ + bool deblocking_enabled=false; + + for (int y=0;yget_sps().PicHeightInCtbsY;y++) { + deblocking_enabled |= derive_edgeFlags_CTBRow(img,y); + } + + return deblocking_enabled; +} + + +// 8.7.2.3 (both, EDGE_VER and EDGE_HOR) +void derive_boundaryStrength(de265_image* img, bool vertical, int yStart,int yEnd, + int xStart,int xEnd) +{ + int xIncr = vertical ? 2 : 1; + int yIncr = vertical ? 1 : 2; + int xOffs = vertical ? 1 : 0; + int yOffs = vertical ? 0 : 1; + int edgeMask = vertical ? + (DEBLOCK_FLAG_VERTI | DEBLOCK_PB_EDGE_VERTI) : + (DEBLOCK_FLAG_HORIZ | DEBLOCK_PB_EDGE_HORIZ); + int transformEdgeMask = vertical ? DEBLOCK_FLAG_VERTI : DEBLOCK_FLAG_HORIZ; + + xEnd = libde265_min(xEnd,img->get_deblk_width()); + yEnd = libde265_min(yEnd,img->get_deblk_height()); + + int TUShift = img->get_sps().Log2MinTrafoSize; + int TUStride= img->get_sps().PicWidthInTbsY; + + for (int y=yStart;yget_deblk_flags(xDi,yDi) & edgeMask) ? "edge" : "..."); + + uint8_t edgeFlags = img->get_deblk_flags(xDi,yDi); + + if (edgeFlags & edgeMask) { + bool p_is_intra_pred = (img->get_pred_mode(xDi-xOffs, yDi-yOffs) == MODE_INTRA); + bool q_is_intra_pred = (img->get_pred_mode(xDi, yDi ) == MODE_INTRA); + + int bS; + + if (p_is_intra_pred || q_is_intra_pred) { + bS = 2; + } + else { + // opposing site + int xDiOpp = xDi-xOffs; + int yDiOpp = yDi-yOffs; + + if ((edgeFlags & transformEdgeMask) && + (img->get_nonzero_coefficient(xDi ,yDi) || + img->get_nonzero_coefficient(xDiOpp,yDiOpp))) { + bS = 1; + } + else { + + bS = 0; + + const PBMotion& mviP = img->get_mv_info(xDiOpp,yDiOpp); + const PBMotion& mviQ = img->get_mv_info(xDi ,yDi); + + slice_segment_header* shdrP = img->get_SliceHeader(xDiOpp,yDiOpp); + slice_segment_header* shdrQ = img->get_SliceHeader(xDi ,yDi); + + int refPicP0 = mviP.predFlag[0] ? shdrP->RefPicList[0][ mviP.refIdx[0] ] : -1; + int refPicP1 = mviP.predFlag[1] ? 
shdrP->RefPicList[1][ mviP.refIdx[1] ] : -1; + int refPicQ0 = mviQ.predFlag[0] ? shdrQ->RefPicList[0][ mviQ.refIdx[0] ] : -1; + int refPicQ1 = mviQ.predFlag[1] ? shdrQ->RefPicList[1][ mviQ.refIdx[1] ] : -1; + + bool samePics = ((refPicP0==refPicQ0 && refPicP1==refPicQ1) || + (refPicP0==refPicQ1 && refPicP1==refPicQ0)); + + if (!samePics) { + bS = 1; + } + else { + MotionVector mvP0 = mviP.mv[0]; if (!mviP.predFlag[0]) { mvP0.x=mvP0.y=0; } + MotionVector mvP1 = mviP.mv[1]; if (!mviP.predFlag[1]) { mvP1.x=mvP1.y=0; } + MotionVector mvQ0 = mviQ.mv[0]; if (!mviQ.predFlag[0]) { mvQ0.x=mvQ0.y=0; } + MotionVector mvQ1 = mviQ.mv[1]; if (!mviQ.predFlag[1]) { mvQ1.x=mvQ1.y=0; } + + int numMV_P = mviP.predFlag[0] + mviP.predFlag[1]; + int numMV_Q = mviQ.predFlag[0] + mviQ.predFlag[1]; + + if (numMV_P!=numMV_Q) { + img->decctx->add_warning(DE265_WARNING_NUMMVP_NOT_EQUAL_TO_NUMMVQ, false); + img->integrity = INTEGRITY_DECODING_ERRORS; + } + + // two different reference pictures or only one reference picture + if (refPicP0 != refPicP1) { + + if (refPicP0 == refPicQ0) { + if (abs_value(mvP0.x-mvQ0.x) >= 4 || + abs_value(mvP0.y-mvQ0.y) >= 4 || + abs_value(mvP1.x-mvQ1.x) >= 4 || + abs_value(mvP1.y-mvQ1.y) >= 4) { + bS = 1; + } + } + else { + if (abs_value(mvP0.x-mvQ1.x) >= 4 || + abs_value(mvP0.y-mvQ1.y) >= 4 || + abs_value(mvP1.x-mvQ0.x) >= 4 || + abs_value(mvP1.y-mvQ0.y) >= 4) { + bS = 1; + } + } + } + else { + assert(refPicQ0==refPicQ1); + + if ((abs_value(mvP0.x-mvQ0.x) >= 4 || + abs_value(mvP0.y-mvQ0.y) >= 4 || + abs_value(mvP1.x-mvQ1.x) >= 4 || + abs_value(mvP1.y-mvQ1.y) >= 4) + && + (abs_value(mvP0.x-mvQ1.x) >= 4 || + abs_value(mvP0.y-mvQ1.y) >= 4 || + abs_value(mvP1.x-mvQ0.x) >= 4 || + abs_value(mvP1.y-mvQ0.y) >= 4)) { + bS = 1; + } + } + } + + /* + printf("unimplemented deblocking code for CU at %d;%d\n",xDi,yDi); + + logerror(LogDeblock, "unimplemented code reached (file %s, line %d)\n", + __FILE__, __LINE__); + */ + } + } + + img->set_deblk_bS(xDi,yDi, bS); + } + else { 
+ img->set_deblk_bS(xDi,yDi, 0); + } + } +} + + +void derive_boundaryStrength_CTB(de265_image* img, bool vertical, int xCtb,int yCtb) +{ + int ctbSize = img->get_sps().CtbSizeY; + int deblkSize = ctbSize/4; + + derive_boundaryStrength(img,vertical, + yCtb*deblkSize, (yCtb+1)*deblkSize, + xCtb*deblkSize, (xCtb+1)*deblkSize); +} + + +static uint8_t table_8_23_beta[52] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, + 9,10,11,12,13,14,15,16,17,18,20,22,24,26,28,30,32,34,36, + 38,40,42,44,46,48,50,52,54,56,58,60,62,64 +}; + +static uint8_t table_8_23_tc[54] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, + 5, 5, 6, 6, 7, 8, 9,10,11,13,14,16,18,20,22,24 +}; + + + +// 8.7.2.4 +template +void edge_filtering_luma_internal(de265_image* img, bool vertical, + int yStart,int yEnd, int xStart,int xEnd) +{ + //printf("luma %d-%d %d-%d\n",xStart,xEnd,yStart,yEnd); + + const seq_parameter_set& sps = img->get_sps(); + + int xIncr = vertical ? 2 : 1; + int yIncr = vertical ? 1 : 2; + + const int stride = img->get_image_stride(0); + + int bitDepth_Y = sps.BitDepth_Y; + + xEnd = libde265_min(xEnd,img->get_deblk_width()); + yEnd = libde265_min(yEnd,img->get_deblk_height()); + + for (int y=yStart;y pixel resolution + int yDi = y<<2; // *4 -> pixel resolution + int bS = img->get_deblk_bS(xDi,yDi); + + //printf("x,y:%d,%d xDi,yDi:%d,%d\n",x,y,xDi,yDi); + + logtrace(LogDeblock,"deblock POC=%d %c --- x:%d y:%d bS:%d---\n", + img->PicOrderCntVal,vertical ? 
'V':'H',xDi,yDi,bS); + +#if 0 + { + uint8_t* ptr = img->y + stride*yDi + xDi; + + for (int dy=-4;dy<4;dy++) { + for (int dx=-4;dx<4;dx++) { + printf("%02x ", ptr[dy*stride + dx]); + if (dx==-1) printf("| "); + } + printf("\n"); + if (dy==-1) printf("-------------------------\n"); + } + } +#endif + +#if 0 + if (!vertical) + { + uint8_t* ptr = img->y + stride*yDi + xDi; + + for (int dy=-4;dy<4;dy++) { + for (int dx=0;dx<4;dx++) { + printf("%02x ", ptr[dy*stride + dx]); + if (dx==-1) printf("| "); + } + printf("\n"); + if (dy==-1) printf("-------------------------\n"); + } + } +#endif + + if (bS>0) { + + // 8.7.2.4.3 + + pixel_t* ptr = img->get_image_plane_at_pos_NEW(0, xDi,yDi); + + pixel_t q[4][4], p[4][4]; + for (int k=0;k<4;k++) + for (int i=0;i<4;i++) + { + if (vertical) { + q[k][i] = ptr[ i +k*stride]; + p[k][i] = ptr[-i-1+k*stride]; + } + else { + q[k][i] = ptr[k + i *stride]; + p[k][i] = ptr[k -(i+1)*stride]; + } + } + +#if 0 + for (int k=0;k<4;k++) + { + for (int i=0;i<4;i++) + { + printf("%02x ", p[k][3-i]); + } + + printf("| "); + + for (int i=0;i<4;i++) + { + printf("%02x ", q[k][i]); + } + printf("\n"); + } +#endif + + + int QP_Q = img->get_QPY(xDi,yDi); + int QP_P = (vertical ? 
+ img->get_QPY(xDi-1,yDi) : + img->get_QPY(xDi,yDi-1) ); + int qP_L = (QP_Q+QP_P+1)>>1; + + logtrace(LogDeblock,"QP: %d & %d -> %d\n",QP_Q,QP_P,qP_L); + + int sliceIndexQ00 = img->get_SliceHeaderIndex(xDi,yDi); + int beta_offset = img->slices[sliceIndexQ00]->slice_beta_offset; + int tc_offset = img->slices[sliceIndexQ00]->slice_tc_offset; + + int Q_beta = Clip3(0,51, qP_L + beta_offset); + int betaPrime = table_8_23_beta[Q_beta]; + int beta = betaPrime * (1<<(bitDepth_Y - 8)); + + int Q_tc = Clip3(0,53, qP_L + 2*(bS-1) + tc_offset); + int tcPrime = table_8_23_tc[Q_tc]; + int tc = tcPrime * (1<<(bitDepth_Y - 8)); + + logtrace(LogDeblock,"beta: %d (%d) tc: %d (%d)\n",beta,beta_offset, tc,tc_offset); + + int dE=0, dEp=0, dEq=0; + + if (vertical || !vertical) { + int dp0 = abs_value(p[0][2] - 2*p[0][1] + p[0][0]); + int dp3 = abs_value(p[3][2] - 2*p[3][1] + p[3][0]); + int dq0 = abs_value(q[0][2] - 2*q[0][1] + q[0][0]); + int dq3 = abs_value(q[3][2] - 2*q[3][1] + q[3][0]); + + int dpq0 = dp0 + dq0; + int dpq3 = dp3 + dq3; + + int dp = dp0 + dp3; + int dq = dq0 + dq3; + int d = dpq0+ dpq3; + + if (d>2) && + abs_value(p[0][3]-p[0][0])+abs_value(q[0][0]-q[0][3]) < (beta>>3) && + abs_value(p[0][0]-q[0][0]) < ((5*tc+1)>>1)); + + bool dSam3 = (2*dpq3 < (beta>>2) && + abs_value(p[3][3]-p[3][0])+abs_value(q[3][0]-q[3][3]) < (beta>>3) && + abs_value(p[3][0]-q[3][0]) < ((5*tc+1)>>1)); + + if (dSam0 && dSam3) { + dE=2; + } + else { + dE=1; + } + + if (dp < ((beta + (beta>>1))>>3)) { dEp=1; } + if (dq < ((beta + (beta>>1))>>3)) { dEq=1; } + + logtrace(LogDeblock,"dE:%d dEp:%d dEq:%d\n",dE,dEp,dEq); + } + } + else { + // TODO + assert(0); + } + + + // 8.7.2.4.4 + + if (dE != 0) { + bool filterP = true; + bool filterQ = true; + + if (vertical) { + if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(xDi-1,yDi)) filterP=false; + if (img->get_cu_transquant_bypass(xDi-1,yDi)) filterP=false; + + if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(xDi,yDi)) filterQ=false; + if 
(img->get_cu_transquant_bypass(xDi,yDi)) filterQ=false; + } + else { + if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(xDi,yDi-1)) filterP=false; + if (img->get_cu_transquant_bypass(xDi,yDi-1)) filterP=false; + + if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(xDi,yDi)) filterQ=false; + if (img->get_cu_transquant_bypass(xDi,yDi)) filterQ=false; + } + + for (int k=0;k<4;k++) { + //int nDp,nDq; + + logtrace(LogDeblock,"line:%d\n",k); + + const pixel_t p0 = p[k][0]; + const pixel_t p1 = p[k][1]; + const pixel_t p2 = p[k][2]; + const pixel_t p3 = p[k][3]; + const pixel_t q0 = q[k][0]; + const pixel_t q1 = q[k][1]; + const pixel_t q2 = q[k][2]; + const pixel_t q3 = q[k][3]; + + if (dE==2) { + // strong filtering + + //nDp=nDq=3; + + pixel_t pnew[3],qnew[3]; + pnew[0] = Clip3(p0-2*tc,p0+2*tc, (p2 + 2*p1 + 2*p0 + 2*q0 + q1 +4)>>3); + pnew[1] = Clip3(p1-2*tc,p1+2*tc, (p2 + p1 + p0 + q0+2)>>2); + pnew[2] = Clip3(p2-2*tc,p2+2*tc, (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3); + qnew[0] = Clip3(q0-2*tc,q0+2*tc, (p1+2*p0+2*q0+2*q1+q2+4)>>3); + qnew[1] = Clip3(q1-2*tc,q1+2*tc, (p0+q0+q1+q2+2)>>2); + qnew[2] = Clip3(q2-2*tc,q2+2*tc, (p0+q0+q1+3*q2+2*q3+4)>>3); + + logtrace(LogDeblock,"strong filtering\n"); + + if (vertical) { + for (int i=0;i<3;i++) { + if (filterP) { ptr[-i-1+k*stride] = pnew[i]; } + if (filterQ) { ptr[ i + k*stride] = qnew[i]; } + } + + // ptr[-1+k*stride] = ptr[ 0+k*stride] = 200; + } + else { + for (int i=0;i<3;i++) { + if (filterP) { ptr[ k -(i+1)*stride] = pnew[i]; } + if (filterQ) { ptr[ k + i *stride] = qnew[i]; } + } + } + } + else { + // weak filtering + + //nDp=nDq=0; + + int delta = (9*(q0-p0) - 3*(q1-p1) + 8)>>4; + logtrace(LogDeblock,"delta=%d, tc=%d\n",delta,tc); + + if (abs_value(delta) < tc*10) { + + delta = Clip3(-tc,tc,delta); + logtrace(LogDeblock," deblk + %d;%d [%02x->%02x] - %d;%d [%02x->%02x] delta:%d\n", + vertical ? xDi-1 : xDi+k, + vertical ? yDi+k : yDi-1, p0,Clip_BitDepth(p0+delta, bitDepth_Y), + vertical ? 
xDi : xDi+k, + vertical ? yDi+k : yDi, q0,Clip_BitDepth(q0-delta, bitDepth_Y), + delta); + + if (vertical) { + if (filterP) { ptr[-0-1+k*stride] = Clip_BitDepth(p0+delta, bitDepth_Y); } + if (filterQ) { ptr[ 0 +k*stride] = Clip_BitDepth(q0-delta, bitDepth_Y); } + } + else { + if (filterP) { ptr[ k -1*stride] = Clip_BitDepth(p0+delta, bitDepth_Y); } + if (filterQ) { ptr[ k +0*stride] = Clip_BitDepth(q0-delta, bitDepth_Y); } + } + + //ptr[ 0+k*stride] = 200; + + if (dEp==1 && filterP) { + int delta_p = Clip3(-(tc>>1), tc>>1, (((p2+p0+1)>>1)-p1+delta)>>1); + + logtrace(LogDeblock," deblk dEp %d;%d delta:%d\n", + vertical ? xDi-2 : xDi+k, + vertical ? yDi+k : yDi-2, + delta_p); + + if (vertical) { ptr[-1-1+k*stride] = Clip_BitDepth(p1+delta_p, bitDepth_Y); } + else { ptr[ k -2*stride] = Clip_BitDepth(p1+delta_p, bitDepth_Y); } + } + + if (dEq==1 && filterQ) { + int delta_q = Clip3(-(tc>>1), tc>>1, (((q2+q0+1)>>1)-q1-delta)>>1); + + logtrace(LogDeblock," delkb dEq %d;%d delta:%d\n", + vertical ? xDi+1 : xDi+k, + vertical ? yDi+k : yDi+1, + delta_q); + + if (vertical) { ptr[ 1 +k*stride] = Clip_BitDepth(q1+delta_q, bitDepth_Y); } + else { ptr[ k +1*stride] = Clip_BitDepth(q1+delta_q, bitDepth_Y); } + } + + //nDp = dEp+1; + //nDq = dEq+1; + + //logtrace(LogDeblock,"weak filtering (%d:%d)\n",nDp,nDq); + } + } + } + } + } + } +} + + +void edge_filtering_luma(de265_image* img, bool vertical, + int yStart,int yEnd, int xStart,int xEnd) +{ + if (img->high_bit_depth(0)) { + edge_filtering_luma_internal(img,vertical,yStart,yEnd,xStart,xEnd); + } + else { + edge_filtering_luma_internal(img,vertical,yStart,yEnd,xStart,xEnd); + } +} + +void edge_filtering_luma_CTB(de265_image* img, bool vertical, int xCtb,int yCtb) +{ + int ctbSize = img->get_sps().CtbSizeY; + int deblkSize = ctbSize/4; + + edge_filtering_luma(img,vertical, + yCtb*deblkSize, (yCtb+1)*deblkSize, + xCtb*deblkSize, (xCtb+1)*deblkSize); +} + + + + +// 8.7.2.4 +/** ?Start and ?End values in 4-luma pixels resolution. 
+ */ +template +void edge_filtering_chroma_internal(de265_image* img, bool vertical, + int yStart,int yEnd, + int xStart,int xEnd) +{ + //printf("chroma %d-%d %d-%d\n",xStart,xEnd,yStart,yEnd); + + const seq_parameter_set& sps = img->get_sps(); + + const int SubWidthC = sps.SubWidthC; + const int SubHeightC = sps.SubHeightC; + + int xIncr = vertical ? 2 : 1; + int yIncr = vertical ? 1 : 2; + + xIncr *= SubWidthC; + yIncr *= SubHeightC; + + const int stride = img->get_image_stride(1); + + xEnd = libde265_min(xEnd,img->get_deblk_width()); + yEnd = libde265_min(yEnd,img->get_deblk_height()); + + int bitDepth_C = sps.BitDepth_C; + + for (int y=yStart;yget_deblk_bS(xDi*SubWidthC,yDi*SubHeightC); + + if (bS>1) { + // 8.7.2.4.5 + + for (int cplane=0;cplane<2;cplane++) { + int cQpPicOffset = (cplane==0 ? + img->get_pps().pic_cb_qp_offset : + img->get_pps().pic_cr_qp_offset); + + pixel_t* ptr = img->get_image_plane_at_pos_NEW(cplane+1, xDi,yDi); + + pixel_t p[2][4]; + pixel_t q[2][4]; + + logtrace(LogDeblock,"-%s- %d %d\n",cplane==0 ? "Cb" : "Cr",xDi,yDi); + + for (int i=0;i<2;i++) + for (int k=0;k<4;k++) + { + if (vertical) { + q[i][k] = ptr[ i +k*stride]; + p[i][k] = ptr[-i-1+k*stride]; + } + else { + q[i][k] = ptr[k + i *stride]; + p[i][k] = ptr[k -(i+1)*stride]; + } + } + +#if 0 + for (int k=0;k<4;k++) + { + for (int i=0;i<2;i++) + { + printf("%02x ", p[1-i][k]); + } + + printf("| "); + + for (int i=0;i<2;i++) + { + printf("%02x ", q[i][k]); + } + printf("\n"); + } +#endif + + int QP_Q = img->get_QPY(SubWidthC*xDi,SubHeightC*yDi); + int QP_P = (vertical ? 
+ img->get_QPY(SubWidthC*xDi-1,SubHeightC*yDi) : + img->get_QPY(SubWidthC*xDi,SubHeightC*yDi-1)); + int qP_i = ((QP_Q+QP_P+1)>>1) + cQpPicOffset; + int QP_C; + if (sps.ChromaArrayType == CHROMA_420) { + QP_C = table8_22(qP_i); + } else { + QP_C = libde265_min(qP_i, 51); + } + + + //printf("POC=%d\n",ctx->img->PicOrderCntVal); + logtrace(LogDeblock,"%d %d: ((%d+%d+1)>>1) + %d = qP_i=%d (QP_C=%d)\n", + SubWidthC*xDi,SubHeightC*yDi, QP_Q,QP_P,cQpPicOffset,qP_i,QP_C); + + int sliceIndexQ00 = img->get_SliceHeaderIndex(SubWidthC*xDi,SubHeightC*yDi); + int tc_offset = img->slices[sliceIndexQ00]->slice_tc_offset; + + int Q = Clip3(0,53, QP_C + 2*(bS-1) + tc_offset); + + int tcPrime = table_8_23_tc[Q]; + int tc = tcPrime * (1<<(sps.BitDepth_C - 8)); + + logtrace(LogDeblock,"tc_offset=%d Q=%d tc'=%d tc=%d\n",tc_offset,Q,tcPrime,tc); + + if (vertical) { + bool filterP = true; + if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(SubWidthC*xDi-1,SubHeightC*yDi)) filterP=false; + if (img->get_cu_transquant_bypass(SubWidthC*xDi-1,SubHeightC*yDi)) filterP=false; + + bool filterQ = true; + if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(SubWidthC*xDi,SubHeightC*yDi)) filterQ=false; + if (img->get_cu_transquant_bypass(SubWidthC*xDi,SubHeightC*yDi)) filterQ=false; + + + for (int k=0;k<4;k++) { + int delta = Clip3(-tc,tc, ((((q[0][k]-p[0][k])<<2)+p[1][k]-q[1][k]+4)>>3)); + logtrace(LogDeblock,"delta=%d\n",delta); + if (filterP) { ptr[-1+k*stride] = Clip_BitDepth(p[0][k]+delta, bitDepth_C); } + if (filterQ) { ptr[ 0+k*stride] = Clip_BitDepth(q[0][k]-delta, bitDepth_C); } + } + } + else { + bool filterP = true; + if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(SubWidthC*xDi,SubHeightC*yDi-1)) filterP=false; + if (img->get_cu_transquant_bypass(SubWidthC*xDi,SubHeightC*yDi-1)) filterP=false; + + bool filterQ = true; + if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(SubWidthC*xDi,SubHeightC*yDi)) filterQ=false; + if 
(img->get_cu_transquant_bypass(SubWidthC*xDi,SubHeightC*yDi)) filterQ=false; + + for (int k=0;k<4;k++) { + int delta = Clip3(-tc,tc, ((((q[0][k]-p[0][k])<<2)+p[1][k]-q[1][k]+4)>>3)); + if (filterP) { ptr[ k-1*stride] = Clip_BitDepth(p[0][k]+delta, bitDepth_C); } + if (filterQ) { ptr[ k+0*stride] = Clip_BitDepth(q[0][k]-delta, bitDepth_C); } + } + } + } + } + } +} + + +void edge_filtering_chroma(de265_image* img, bool vertical, int yStart,int yEnd, + int xStart,int xEnd) +{ + if (img->high_bit_depth(1)) { + edge_filtering_chroma_internal(img,vertical,yStart,yEnd,xStart,xEnd); + } + else { + edge_filtering_chroma_internal(img,vertical,yStart,yEnd,xStart,xEnd); + } +} + + +void edge_filtering_chroma_CTB(de265_image* img, bool vertical, int xCtb,int yCtb) +{ + int ctbSize = img->get_sps().CtbSizeY; + int deblkSize = ctbSize/4; + + edge_filtering_chroma(img,vertical, + yCtb*deblkSize, (yCtb+1)*deblkSize, + xCtb*deblkSize, (xCtb+1)*deblkSize); +} + + + +class thread_task_deblock_CTBRow : public thread_task +{ +public: + struct de265_image* img; + int ctb_y; + bool vertical; + + virtual void work(); + virtual std::string name() const { + char buf[100]; + sprintf(buf,"deblock-%d",ctb_y); + return buf; + } +}; + + +void thread_task_deblock_CTBRow::work() +{ + state = Running; + img->thread_run(this); + + int xStart=0; + int xEnd = img->get_deblk_width(); + + int ctbSize = img->get_sps().CtbSizeY; + int deblkSize = ctbSize/4; + + int first = ctb_y * deblkSize; + int last = (ctb_y+1) * deblkSize; + if (last > img->get_deblk_height()) { + last = img->get_deblk_height(); + } + + int finalProgress = CTB_PROGRESS_DEBLK_V; + if (!vertical) finalProgress = CTB_PROGRESS_DEBLK_H; + + int rightCtb = img->get_sps().PicWidthInCtbsY-1; + + if (vertical) { + // pass 1: vertical + + int CtbRow = std::min(ctb_y+1 , img->get_sps().PicHeightInCtbsY-1); + img->wait_for_progress(this, rightCtb,CtbRow, CTB_PROGRESS_PREFILTER); + } + else { + // pass 2: horizontal + + if (ctb_y>0) { + 
img->wait_for_progress(this, rightCtb,ctb_y-1, CTB_PROGRESS_DEBLK_V); + } + + img->wait_for_progress(this, rightCtb,ctb_y, CTB_PROGRESS_DEBLK_V); + + if (ctb_y+1get_sps().PicHeightInCtbsY) { + img->wait_for_progress(this, rightCtb,ctb_y+1, CTB_PROGRESS_DEBLK_V); + } + } + + //printf("deblock %d to %d orientation: %d\n",first,last,vertical); + + bool deblocking_enabled; + + // first pass: check edge flags and whether we have to deblock + if (vertical) { + deblocking_enabled = derive_edgeFlags_CTBRow(img, ctb_y); + + //for (int x=0;x<=rightCtb;x++) { + int x=0; img->set_CtbDeblockFlag(x,ctb_y, deblocking_enabled); + //} + } + else { + int x=0; deblocking_enabled=img->get_CtbDeblockFlag(x,ctb_y); + } + + if (deblocking_enabled) { + derive_boundaryStrength(img, vertical, first,last, xStart,xEnd); + + edge_filtering_luma(img, vertical, first,last, xStart,xEnd); + + if (img->get_sps().ChromaArrayType != CHROMA_MONO) { + edge_filtering_chroma(img, vertical, first,last, xStart,xEnd); + } + } + + for (int x=0;x<=rightCtb;x++) { + const int CtbWidth = img->get_sps().PicWidthInCtbsY; + img->ctb_progress[x+ctb_y*CtbWidth].set_progress(finalProgress); + } + + state = Finished; + img->thread_finishes(this); +} + + +void add_deblocking_tasks(image_unit* imgunit) +{ + de265_image* img = imgunit->img; + decoder_context* ctx = img->decctx; + + int nRows = img->get_sps().PicHeightInCtbsY; + + int n=0; + img->thread_start(nRows*2); + + for (int pass=0;pass<2;pass++) + { + for (int y=0;yget_sps().PicHeightInCtbsY;y++) + { + thread_task_deblock_CTBRow* task = new thread_task_deblock_CTBRow; + + task->img = img; + task->ctb_y = y; + task->vertical = (pass==0); + + imgunit->tasks.push_back(task); + add_task(&ctx->thread_pool_, task); + n++; + } + } +} + + +void apply_deblocking_filter(de265_image* img) // decoder_context* ctx) +{ + decoder_context* ctx = img->decctx; + + char enabled_deblocking = derive_edgeFlags(img); + + if (enabled_deblocking) + { + // vertical filtering + + 
logtrace(LogDeblock,"VERTICAL\n"); + derive_boundaryStrength(img, true ,0,img->get_deblk_height(),0,img->get_deblk_width()); + edge_filtering_luma (img, true ,0,img->get_deblk_height(),0,img->get_deblk_width()); + + if (img->get_sps().ChromaArrayType != CHROMA_MONO) { + edge_filtering_chroma (img, true ,0,img->get_deblk_height(),0,img->get_deblk_width()); + } +#if 0 + char buf[1000]; + sprintf(buf,"lf-after-V-%05d.yuv", ctx->img->PicOrderCntVal); + write_picture_to_file(ctx->img, buf); +#endif + + // horizontal filtering + + logtrace(LogDeblock,"HORIZONTAL\n"); + derive_boundaryStrength(img, false ,0,img->get_deblk_height(),0,img->get_deblk_width()); + edge_filtering_luma (img, false ,0,img->get_deblk_height(),0,img->get_deblk_width()); + + if (img->get_sps().ChromaArrayType != CHROMA_MONO) { + edge_filtering_chroma (img, false ,0,img->get_deblk_height(),0,img->get_deblk_width()); + } + +#if 0 + sprintf(buf,"lf-after-H-%05d.yuv", ctx->img->PicOrderCntVal); + write_picture_to_file(ctx->img, buf); +#endif + } +} diff --git a/decctx.cc b/decctx.cc new file mode 100644 index 0000000..edebb71 --- /dev/null +++ b/decctx.cc @@ -0,0 +1,2285 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "decctx.h" +#include "util.h" +#include "sao.h" +#include "sei.h" +#include "deblock.h" + +#include +#include +#include +#include +#include + +#include "fallback.h" + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifdef HAVE_SSE4_1 +#include "x86/sse.h" +#endif + +#ifdef HAVE_ARM +#include "arm/arm.h" +#endif + +#define SAVE_INTERMEDIATE_IMAGES 0 + +#if SAVE_INTERMEDIATE_IMAGES +#include "visualize.h" +#endif + +extern void thread_decode_CTB_row(void* d); +extern void thread_decode_slice_segment(void* d); + + +thread_context::thread_context() +{ + /* + CtbAddrInRS = 0; + CtbAddrInTS = 0; + + CtbX = 0; + CtbY = 0; + */ + + /* + refIdx[0] = refIdx[1] = 0; + mvd[0][0] = mvd[0][1] = mvd[1][0] = mvd[1][1] = 0; + merge_flag = 0; + merge_idx = 0; + mvp_lX_flag[0] = mvp_lX_flag[1] = 0; + inter_pred_idc = 0; + */ + + /* + enum IntraPredMode IntraPredModeC; // chroma intra-prediction mode for current CB + */ + + /* + cu_transquant_bypass_flag = false; + memset(transform_skip_flag,0, 3*sizeof(uint8_t)); + */ + + + //memset(coeffList,0,sizeof(int16_t)*3*32*32); + //memset(coeffPos,0,sizeof(int16_t)*3*32*32); + //memset(nCoeff,0,sizeof(int16_t)*3); + + + + IsCuQpDeltaCoded = false; + CuQpDelta = 0; + + IsCuChromaQpOffsetCoded = false; + CuQpOffsetCb = 0; + CuQpOffsetCr = 0; + + /* + currentQPY = 0; + currentQG_x = 0; + currentQG_y = 0; + lastQPYinPreviousQG = 0; + */ + + /* + qPYPrime = 0; + qPCbPrime = 0; + qPCrPrime = 0; + */ + + /* + memset(&cabac_decoder, 0, sizeof(CABAC_decoder)); + memset(&ctx_model, 0, sizeof(ctx_model)); + */ + + decctx = NULL; + img = NULL; + shdr = NULL; + + imgunit = NULL; + sliceunit = NULL; + + + //memset(this,0,sizeof(thread_context)); + + // There is a interesting issue here. When aligning _coeffBuf to 16 bytes offset with + // __attribute__((align(16))), the following statement is optimized away since the + // compiler assumes that the pointer would be 16-byte aligned. 
However, this is not the + // case when the structure has been dynamically allocated. In this case, the base can + // also be at 8 byte offsets (at least with MingW,32 bit). + int offset = ((uintptr_t)_coeffBuf) & 0xf; + + if (offset == 0) { + coeffBuf = _coeffBuf; // correctly aligned already + } + else { + coeffBuf = (int16_t *) (((uint8_t *)_coeffBuf) + (16-offset)); + } + + memset(coeffBuf, 0, 32*32*sizeof(int16_t)); +} + + +slice_unit::slice_unit(decoder_context* decctx) + : nal(NULL), + shdr(NULL), + imgunit(NULL), + flush_reorder_buffer(false), + nThreads(0), + first_decoded_CTB_RS(-1), + last_decoded_CTB_RS(-1), + thread_contexts(NULL), + ctx(decctx) +{ + state = Unprocessed; + nThreadContexts = 0; +} + +slice_unit::~slice_unit() +{ + ctx->nal_parser.free_NAL_unit(nal); + + if (thread_contexts) { + delete[] thread_contexts; + } +} + + +void slice_unit::allocate_thread_contexts(int n) +{ + assert(thread_contexts==NULL); + + thread_contexts = new thread_context[n]; + nThreadContexts = n; +} + + +image_unit::image_unit() +{ + img=NULL; + role=Invalid; + state=Unprocessed; +} + + +image_unit::~image_unit() +{ + for (int i=0;iFirstAfterEndOfSequenceNAL = true; + //ctx->last_RAP_picture_NAL_type = NAL_UNIT_UNDEFINED; + + //de265_init_image(&ctx->coeff); + + // --- decoded picture buffer --- + + current_image_poc_lsb = -1; // any invalid number +} + + +decoder_context::~decoder_context() +{ + while (!image_units.empty()) { + delete image_units.back(); + image_units.pop_back(); + } +} + + +void decoder_context::set_image_allocation_functions(de265_image_allocation* allocfunc, + void* userdata) +{ + if (allocfunc) { + param_image_allocation_functions = *allocfunc; + param_image_allocation_userdata = userdata; + } + else { + assert(false); // actually, it makes no sense to reset the allocation functions + + param_image_allocation_functions = de265_image::default_image_allocation; + param_image_allocation_userdata = NULL; + } +} + + +de265_error 
decoder_context::start_thread_pool(int nThreads) +{ + ::start_thread_pool(&thread_pool_, nThreads); + + num_worker_threads = nThreads; + + return DE265_OK; +} + + +void decoder_context::stop_thread_pool() +{ + if (get_num_worker_threads()>0) { + //flush_thread_pool(&ctx->thread_pool); + ::stop_thread_pool(&thread_pool_); + } +} + + +void decoder_context::reset() +{ + if (num_worker_threads>0) { + //flush_thread_pool(&ctx->thread_pool); + ::stop_thread_pool(&thread_pool_); + } + + // -------------------------------------------------- + +#if 0 + ctx->end_of_stream = false; + ctx->pending_input_NAL = NULL; + ctx->current_vps = NULL; + ctx->current_sps = NULL; + ctx->current_pps = NULL; + ctx->num_worker_threads = 0; + ctx->current_image_poc_lsb = 0; + ctx->first_decoded_picture = 0; + ctx->NoRaslOutputFlag = 0; + ctx->HandleCraAsBlaFlag = 0; + ctx->FirstAfterEndOfSequenceNAL = 0; + ctx->PicOrderCntMsb = 0; + ctx->prevPicOrderCntLsb = 0; + ctx->prevPicOrderCntMsb = 0; + ctx->NumPocStCurrBefore=0; + ctx->NumPocStCurrAfter=0; + ctx->NumPocStFoll=0; + ctx->NumPocLtCurr=0; + ctx->NumPocLtFoll=0; + ctx->nal_unit_type=0; + ctx->IdrPicFlag=0; + ctx->RapPicFlag=0; +#endif + + img = NULL; + + + // TODO: remove all pending image_units + + + // --- decoded picture buffer --- + + current_image_poc_lsb = -1; // any invalid number + first_decoded_picture = true; + + + // --- remove all pictures from output queue --- + + // there was a bug the peek_next_image did not return NULL on empty output queues. + // This was (indirectly) fixed by recreating the DPB buffer, but it should actually + // be sufficient to clear it like this. + // The error showed while scrubbing the ToS video in VLC. 
+ dpb.clear(); + + nal_parser.remove_pending_input_data(); + + + while (!image_units.empty()) { + delete image_units.back(); + image_units.pop_back(); + } + + // --- start threads again --- + + if (num_worker_threads>0) { + // TODO: need error checking + start_thread_pool(num_worker_threads); + } +} + +void base_context::set_acceleration_functions(enum de265_acceleration l) +{ + // fill scalar functions first (so that function table is completely filled) + + init_acceleration_functions_fallback(&acceleration); + + + // override functions with optimized variants + +#ifdef HAVE_SSE4_1 + if (l>=de265_acceleration_SSE) { + init_acceleration_functions_sse(&acceleration); + } +#endif +#ifdef HAVE_ARM + if (l>=de265_acceleration_ARM) { + init_acceleration_functions_arm(&acceleration); + } +#endif +} + + +void decoder_context::init_thread_context(thread_context* tctx) +{ + // zero scrap memory for coefficient blocks + memset(tctx->_coeffBuf, 0, sizeof(tctx->_coeffBuf)); // TODO: check if we can safely remove this + + tctx->currentQG_x = -1; + tctx->currentQG_y = -1; + + + + // --- find QPY that was active at the end of the previous slice --- + + // find the previous CTB in TS order + + const pic_parameter_set& pps = tctx->img->get_pps(); + const seq_parameter_set& sps = tctx->img->get_sps(); + + + if (tctx->shdr->slice_segment_address > 0) { + int prevCtb = pps.CtbAddrTStoRS[ pps.CtbAddrRStoTS[tctx->shdr->slice_segment_address] -1 ]; + + int ctbX = prevCtb % sps.PicWidthInCtbsY; + int ctbY = prevCtb / sps.PicWidthInCtbsY; + + + // take the pixel at the bottom right corner (but consider that the image size might be smaller) + + int x = ((ctbX+1) << sps.Log2CtbSizeY)-1; + int y = ((ctbY+1) << sps.Log2CtbSizeY)-1; + + x = std::min(x,sps.pic_width_in_luma_samples-1); + y = std::min(y,sps.pic_height_in_luma_samples-1); + + //printf("READ QPY: %d %d -> %d (should %d)\n",x,y,imgunit->img->get_QPY(x,y), tc.currentQPY); + + //if (tctx->shdr->dependent_slice_segment_flag) { // TODO: 
do we need this condition ? + tctx->currentQPY = tctx->img->get_QPY(x,y); + //} + } +} + + +void decoder_context::add_task_decode_CTB_row(thread_context* tctx, + bool firstSliceSubstream, + int ctbRow) +{ + thread_task_ctb_row* task = new thread_task_ctb_row; + task->firstSliceSubstream = firstSliceSubstream; + task->tctx = tctx; + task->debug_startCtbRow = ctbRow; + tctx->task = task; + + add_task(&thread_pool_, task); + + tctx->imgunit->tasks.push_back(task); +} + + +void decoder_context::add_task_decode_slice_segment(thread_context* tctx, bool firstSliceSubstream, + int ctbx,int ctby) +{ + thread_task_slice_segment* task = new thread_task_slice_segment; + task->firstSliceSubstream = firstSliceSubstream; + task->tctx = tctx; + task->debug_startCtbX = ctbx; + task->debug_startCtbY = ctby; + tctx->task = task; + + add_task(&thread_pool_, task); + + tctx->imgunit->tasks.push_back(task); +} + + +de265_error decoder_context::read_vps_NAL(bitreader& reader) +{ + logdebug(LogHeaders,"---> read VPS\n"); + + std::shared_ptr new_vps = std::make_shared(); + de265_error err = new_vps->read(this,&reader); + if (err != DE265_OK) { + return err; + } + + if (param_vps_headers_fd>=0) { + new_vps->dump(param_vps_headers_fd); + } + + vps[ new_vps->video_parameter_set_id ] = new_vps; + + return DE265_OK; +} + +de265_error decoder_context::read_sps_NAL(bitreader& reader) +{ + logdebug(LogHeaders,"----> read SPS\n"); + + std::shared_ptr new_sps = std::make_shared(); + de265_error err; + + if ((err=new_sps->read(this, &reader)) != DE265_OK) { + return err; + } + + if (param_sps_headers_fd>=0) { + new_sps->dump(param_sps_headers_fd); + } + + sps[ new_sps->seq_parameter_set_id ] = new_sps; + + return DE265_OK; +} + +de265_error decoder_context::read_pps_NAL(bitreader& reader) +{ + logdebug(LogHeaders,"----> read PPS\n"); + + std::shared_ptr new_pps = std::make_shared(); + + bool success = new_pps->read(&reader,this); + + if (param_pps_headers_fd>=0) { + 
new_pps->dump(param_pps_headers_fd); + } + + if (success) { + pps[ (int)new_pps->pic_parameter_set_id ] = new_pps; + } + + return success ? DE265_OK : DE265_WARNING_PPS_HEADER_INVALID; +} + +de265_error decoder_context::read_sei_NAL(bitreader& reader, bool suffix) +{ + logdebug(LogHeaders,"----> read SEI\n"); + + sei_message sei; + + //push_current_picture_to_output_queue(); + + de265_error err = DE265_OK; + + if ((err=read_sei(&reader,&sei, suffix, current_sps.get())) == DE265_OK) { + dump_sei(&sei, current_sps.get()); + + if (image_units.empty()==false && suffix) { + image_units.back()->suffix_SEIs.push_back(sei); + } + } + else { + add_warning(err, false); + } + + return err; +} + +de265_error decoder_context::read_eos_NAL(bitreader& reader) +{ + FirstAfterEndOfSequenceNAL = true; + return DE265_OK; +} + +de265_error decoder_context::read_slice_NAL(bitreader& reader, NAL_unit* nal, nal_header& nal_hdr) +{ + logdebug(LogHeaders,"---> read slice segment header\n"); + + + // --- read slice header --- + + slice_segment_header* shdr = new slice_segment_header; + bool continueDecoding; + de265_error err = shdr->read(&reader,this, &continueDecoding); + if (!continueDecoding) { + if (img) { img->integrity = INTEGRITY_NOT_DECODED; } + nal_parser.free_NAL_unit(nal); + delete shdr; + return err; + } + + if (param_slice_headers_fd>=0) { + shdr->dump_slice_segment_header(this, param_slice_headers_fd); + } + + + if (process_slice_segment_header(shdr, &err, nal->pts, &nal_hdr, nal->user_data) == false) + { + if (img!=NULL) img->integrity = INTEGRITY_NOT_DECODED; + nal_parser.free_NAL_unit(nal); + delete shdr; + return err; + } + + this->img->add_slice_segment_header(shdr); + + skip_bits(&reader,1); // TODO: why? 
+ prepare_for_CABAC(&reader); + + + // modify entry_point_offsets + + int headerLength = reader.data - nal->data(); + for (int i=0;inum_entry_point_offsets;i++) { + shdr->entry_point_offset[i] -= nal->num_skipped_bytes_before(shdr->entry_point_offset[i], + headerLength); + } + + + + // --- start a new image if this is the first slice --- + + if (shdr->first_slice_segment_in_pic_flag) { + image_unit* imgunit = new image_unit; + imgunit->img = this->img; + image_units.push_back(imgunit); + } + + + // --- add slice to current picture --- + + if ( ! image_units.empty() ) { + + slice_unit* sliceunit = new slice_unit(this); + sliceunit->nal = nal; + sliceunit->shdr = shdr; + sliceunit->reader = reader; + + sliceunit->flush_reorder_buffer = flush_reorder_buffer_at_this_frame; + + + image_units.back()->slice_units.push_back(sliceunit); + } + + bool did_work; + err = decode_some(&did_work); + + return DE265_OK; +} + + +template void pop_front(std::vector& vec) +{ + for (int i=1;islice_units.empty() ) { + + image_unit* imgunit = image_units[0]; + slice_unit* sliceunit = imgunit->get_next_unprocessed_slice_segment(); + + if (sliceunit != NULL) { + + //pop_front(imgunit->slice_units); + + if (sliceunit->flush_reorder_buffer) { + dpb.flush_reorder_buffer(); + } + + *did_work = true; + + //err = decode_slice_unit_sequential(imgunit, sliceunit); + err = decode_slice_unit_parallel(imgunit, sliceunit); + if (err) { + return err; + } + + //delete sliceunit; + } + } + + + + // if we decoded all slices of the current image and there will not + // be added any more slices to the image, output the image + + if ( ( image_units.size()>=2 && image_units[0]->all_slice_segments_processed()) || + ( image_units.size()>=1 && image_units[0]->all_slice_segments_processed() && + nal_parser.number_of_NAL_units_pending()==0 && + (nal_parser.is_end_of_stream() || nal_parser.is_end_of_frame()) )) { + + image_unit* imgunit = image_units[0]; + + *did_work=true; + + + // mark all CTBs as decoded even if 
they are not, because faulty input + // streams could miss part of the picture + // TODO: this will not work when slice decoding is parallel to post-filtering, + // so we will have to replace this with keeping track of which CTB should have + // been decoded (but aren't because of the input stream being faulty) + + imgunit->img->mark_all_CTB_progress(CTB_PROGRESS_PREFILTER); + + + + // run post-processing filters (deblocking & SAO) + + if (img->decctx->num_worker_threads) + run_postprocessing_filters_parallel(imgunit); + else + run_postprocessing_filters_sequential(imgunit->img); + + // process suffix SEIs + + for (int i=0;isuffix_SEIs.size();i++) { + const sei_message& sei = imgunit->suffix_SEIs[i]; + + err = process_sei(&sei, imgunit->img); + if (err != DE265_OK) + break; + } + + + push_picture_to_output_queue(imgunit); + + // remove just decoded image unit from queue + + delete imgunit; + + pop_front(image_units); + } + + return err; +} + + +de265_error decoder_context::decode_slice_unit_sequential(image_unit* imgunit, + slice_unit* sliceunit) +{ + de265_error err = DE265_OK; + + /* + printf("decode slice POC=%d addr=%d, img=%p\n", + sliceunit->shdr->slice_pic_order_cnt_lsb, + sliceunit->shdr->slice_segment_address, + imgunit->img); + */ + + remove_images_from_dpb(sliceunit->shdr->RemoveReferencesList); + + if (sliceunit->shdr->slice_segment_address >= imgunit->img->get_pps().CtbAddrRStoTS.size()) { + return DE265_ERROR_CTB_OUTSIDE_IMAGE_AREA; + } + + + struct thread_context tctx; + + tctx.shdr = sliceunit->shdr; + tctx.img = imgunit->img; + tctx.decctx = this; + tctx.imgunit = imgunit; + tctx.sliceunit= sliceunit; + tctx.CtbAddrInTS = imgunit->img->get_pps().CtbAddrRStoTS[tctx.shdr->slice_segment_address]; + tctx.task = NULL; + + init_thread_context(&tctx); + + if (sliceunit->reader.bytes_remaining <= 0) { + return DE265_ERROR_PREMATURE_END_OF_SLICE; + } + + init_CABAC_decoder(&tctx.cabac_decoder, + sliceunit->reader.data, + sliceunit->reader.bytes_remaining); 
+ + // alloc CABAC-model array if entropy_coding_sync is enabled + + if (imgunit->img->get_pps().entropy_coding_sync_enabled_flag && + sliceunit->shdr->first_slice_segment_in_pic_flag) { + imgunit->ctx_models.resize( (img->get_sps().PicHeightInCtbsY-1) ); //* CONTEXT_MODEL_TABLE_LENGTH ); + } + + sliceunit->nThreads=1; + + err=read_slice_segment_data(&tctx); + + sliceunit->finished_threads.set_progress(1); + + return err; +} + + +void decoder_context::mark_whole_slice_as_processed(image_unit* imgunit, + slice_unit* sliceunit, + int progress) +{ + //printf("mark whole slice\n"); + + + // mark all CTBs upto the next slice segment as processed + + slice_unit* nextSegment = imgunit->get_next_slice_segment(sliceunit); + if (nextSegment) { + /* + printf("mark whole slice between %d and %d\n", + sliceunit->shdr->slice_segment_address, + nextSegment->shdr->slice_segment_address); + */ + + for (int ctb=sliceunit->shdr->slice_segment_address; + ctb < nextSegment->shdr->slice_segment_address; + ctb++) + { + if (ctb >= imgunit->img->number_of_ctbs()) + break; + + imgunit->img->ctb_progress[ctb].set_progress(progress); + } + } +} + + +de265_error decoder_context::decode_slice_unit_parallel(image_unit* imgunit, + slice_unit* sliceunit) +{ + de265_error err = DE265_OK; + + remove_images_from_dpb(sliceunit->shdr->RemoveReferencesList); + + /* + printf("-------- decode --------\n"); + printf("IMAGE UNIT %p\n",imgunit); + sliceunit->shdr->dump_slice_segment_header(sliceunit->ctx, 1); + imgunit->dump_slices(); + */ + + de265_image* img = imgunit->img; + const pic_parameter_set& pps = img->get_pps(); + + sliceunit->state = slice_unit::InProgress; + + bool use_WPP = (img->decctx->num_worker_threads > 0 && + pps.entropy_coding_sync_enabled_flag); + + bool use_tiles = (img->decctx->num_worker_threads > 0 && + pps.tiles_enabled_flag); + + + // TODO: remove this warning later when we do frame-parallel decoding + if (img->decctx->num_worker_threads > 0 && + 
pps.entropy_coding_sync_enabled_flag == false && + pps.tiles_enabled_flag == false) { + + img->decctx->add_warning(DE265_WARNING_NO_WPP_CANNOT_USE_MULTITHREADING, true); + } + + + // If this is the first slice segment, mark all CTBs before this as processed + // (the real first slice segment could be missing). + + if (imgunit->is_first_slice_segment(sliceunit)) { + slice_segment_header* shdr = sliceunit->shdr; + int firstCTB = shdr->slice_segment_address; + + for (int ctb=0;ctbctb_progress[ctb].set_progress(CTB_PROGRESS_PREFILTER); + } + } + + + // if there is a previous slice that has been completely decoded, + // mark all CTBs until the start of this slice as completed + + //printf("this slice: %p\n",sliceunit); + slice_unit* prevSlice = imgunit->get_prev_slice_segment(sliceunit); + //if (prevSlice) printf("prev slice state: %d\n",prevSlice->state); + if (prevSlice && prevSlice->state == slice_unit::Decoded) { + mark_whole_slice_as_processed(imgunit,prevSlice,CTB_PROGRESS_PREFILTER); + } + + + // TODO: even though we cannot split this into several tasks, we should run it + // as a background thread + if (!use_WPP && !use_tiles) { + //printf("SEQ\n"); + err = decode_slice_unit_sequential(imgunit, sliceunit); + sliceunit->state = slice_unit::Decoded; + mark_whole_slice_as_processed(imgunit,sliceunit,CTB_PROGRESS_PREFILTER); + return err; + } + + + if (use_WPP && use_tiles) { + // TODO: this is not allowed ... 
output some warning or error + + return DE265_WARNING_PPS_HEADER_INVALID; + } + + + if (use_WPP) { + //printf("WPP\n"); + err = decode_slice_unit_WPP(imgunit, sliceunit); + sliceunit->state = slice_unit::Decoded; + mark_whole_slice_as_processed(imgunit,sliceunit,CTB_PROGRESS_PREFILTER); + return err; + } + else if (use_tiles) { + //printf("TILE\n"); + err = decode_slice_unit_tiles(imgunit, sliceunit); + sliceunit->state = slice_unit::Decoded; + mark_whole_slice_as_processed(imgunit,sliceunit,CTB_PROGRESS_PREFILTER); + return err; + } + + assert(false); + return err; +} + + +de265_error decoder_context::decode_slice_unit_WPP(image_unit* imgunit, + slice_unit* sliceunit) +{ + de265_error err = DE265_OK; + + de265_image* img = imgunit->img; + slice_segment_header* shdr = sliceunit->shdr; + const pic_parameter_set& pps = img->get_pps(); + + int nRows = shdr->num_entry_point_offsets +1; + int ctbsWidth = img->get_sps().PicWidthInCtbsY; + + + assert(img->num_threads_active() == 0); + + + // reserve space to store entropy coding context models for each CTB row + + if (shdr->first_slice_segment_in_pic_flag) { + // reserve space for nRows-1 because we don't need to save the CABAC model in the last CTB row + imgunit->ctx_models.resize( (img->get_sps().PicHeightInCtbsY-1) ); //* CONTEXT_MODEL_TABLE_LENGTH ); + } + + + sliceunit->allocate_thread_contexts(nRows); + + + // first CTB in this slice + int ctbAddrRS = shdr->slice_segment_address; + int ctbRow = ctbAddrRS / ctbsWidth; + + for (int entryPt=0;entryPt0) { + ctbRow++; + ctbAddrRS = ctbRow * ctbsWidth; + } + else if (nRows>1 && (ctbAddrRS % ctbsWidth) != 0) { + // If slice segment consists of several WPP rows, each of them + // has to start at a row. 
+ + //printf("does not start at start\n"); + + err = DE265_WARNING_SLICEHEADER_INVALID; + break; + } + + + // prepare thread context + + thread_context* tctx = sliceunit->get_thread_context(entryPt); + + tctx->shdr = shdr; + tctx->decctx = img->decctx; + tctx->img = img; + tctx->imgunit = imgunit; + tctx->sliceunit= sliceunit; + tctx->CtbAddrInTS = pps.CtbAddrRStoTS[ctbAddrRS]; + + init_thread_context(tctx); + + + // init CABAC + + int dataStartIndex; + if (entryPt==0) { dataStartIndex=0; } + else { dataStartIndex=shdr->entry_point_offset[entryPt-1]; } + + int dataEnd; + if (entryPt==nRows-1) dataEnd = sliceunit->reader.bytes_remaining; + else dataEnd = shdr->entry_point_offset[entryPt]; + + if (dataStartIndex<0 || dataEnd>sliceunit->reader.bytes_remaining || + dataEnd <= dataStartIndex) { + //printf("WPP premature end\n"); + err = DE265_ERROR_PREMATURE_END_OF_SLICE; + break; + } + + init_CABAC_decoder(&tctx->cabac_decoder, + &sliceunit->reader.data[dataStartIndex], + dataEnd-dataStartIndex); + + // add task + + //printf("start task for ctb-row: %d\n",ctbRow); + img->thread_start(1); + sliceunit->nThreads++; + add_task_decode_CTB_row(tctx, entryPt==0, ctbRow); + } + +#if 0 + for (;;) { + printf("q:%d r:%d b:%d f:%d\n", + img->nThreadsQueued, + img->nThreadsRunning, + img->nThreadsBlocked, + img->nThreadsFinished); + + if (img->debug_is_completed()) break; + + usleep(1000); + } +#endif + + img->wait_for_completion(); + + for (int i=0;itasks.size();i++) + delete imgunit->tasks[i]; + imgunit->tasks.clear(); + + return DE265_OK; +} + +de265_error decoder_context::decode_slice_unit_tiles(image_unit* imgunit, + slice_unit* sliceunit) +{ + de265_error err = DE265_OK; + + de265_image* img = imgunit->img; + slice_segment_header* shdr = sliceunit->shdr; + const pic_parameter_set& pps = img->get_pps(); + + int nTiles = shdr->num_entry_point_offsets +1; + int ctbsWidth = img->get_sps().PicWidthInCtbsY; + + + assert(img->num_threads_active() == 0); + + 
sliceunit->allocate_thread_contexts(nTiles); + + + // first CTB in this slice + int ctbAddrRS = shdr->slice_segment_address; + int tileID = pps.TileIdRS[ctbAddrRS]; + + for (int entryPt=0;entryPt0) { + tileID++; + + if (tileID >= pps.num_tile_columns * pps.num_tile_rows) { + err = DE265_WARNING_SLICEHEADER_INVALID; + break; + } + + int ctbX = pps.colBd[tileID % pps.num_tile_columns]; + int ctbY = pps.rowBd[tileID / pps.num_tile_columns]; + ctbAddrRS = ctbY * ctbsWidth + ctbX; + } + + // set thread context + + thread_context* tctx = sliceunit->get_thread_context(entryPt); + + tctx->shdr = shdr; + tctx->decctx = img->decctx; + tctx->img = img; + tctx->imgunit = imgunit; + tctx->sliceunit= sliceunit; + tctx->CtbAddrInTS = pps.CtbAddrRStoTS[ctbAddrRS]; + + init_thread_context(tctx); + + + // init CABAC + + int dataStartIndex; + if (entryPt==0) { dataStartIndex=0; } + else { dataStartIndex=shdr->entry_point_offset[entryPt-1]; } + + int dataEnd; + if (entryPt==nTiles-1) dataEnd = sliceunit->reader.bytes_remaining; + else dataEnd = shdr->entry_point_offset[entryPt]; + + if (dataStartIndex<0 || dataEnd>sliceunit->reader.bytes_remaining || + dataEnd <= dataStartIndex) { + err = DE265_ERROR_PREMATURE_END_OF_SLICE; + break; + } + + init_CABAC_decoder(&tctx->cabac_decoder, + &sliceunit->reader.data[dataStartIndex], + dataEnd-dataStartIndex); + + // add task + + //printf("add tiles thread\n"); + img->thread_start(1); + sliceunit->nThreads++; + add_task_decode_slice_segment(tctx, entryPt==0, + ctbAddrRS % ctbsWidth, + ctbAddrRS / ctbsWidth); + } + + img->wait_for_completion(); + + for (int i=0;itasks.size();i++) + delete imgunit->tasks[i]; + imgunit->tasks.clear(); + + return err; +} + + +de265_error decoder_context::decode_NAL(NAL_unit* nal) +{ + //return decode_NAL_OLD(nal); + + decoder_context* ctx = this; + + de265_error err = DE265_OK; + + bitreader reader; + bitreader_init(&reader, nal->data(), nal->size()); + + nal_header nal_hdr; + nal_hdr.read(&reader); + 
ctx->process_nal_hdr(&nal_hdr); + + if (nal_hdr.nuh_layer_id > 0) { + // Discard all NAL units with nuh_layer_id > 0 + // These will have to be handeled by an SHVC decoder. + nal_parser.free_NAL_unit(nal); + return DE265_OK; + } + + loginfo(LogHighlevel,"NAL: 0x%x 0x%x - unit type:%s temporal id:%d\n", + nal->data()[0], nal->data()[1], + get_NAL_name(nal_hdr.nal_unit_type), + nal_hdr.nuh_temporal_id); + + /* + printf("NAL: 0x%x 0x%x - unit type:%s temporal id:%d\n", + nal->data()[0], nal->data()[1], + get_NAL_name(nal_hdr.nal_unit_type), + nal_hdr.nuh_temporal_id); + */ + + // throw away NALs from higher TIDs than currently selected + // TODO: better online switching of HighestTID + + //printf("hTid: %d\n", current_HighestTid); + + if (nal_hdr.nuh_temporal_id > current_HighestTid) { + nal_parser.free_NAL_unit(nal); + return DE265_OK; + } + + + if (nal_hdr.nal_unit_type<32) { + err = read_slice_NAL(reader, nal, nal_hdr); + } + else switch (nal_hdr.nal_unit_type) { + case NAL_UNIT_VPS_NUT: + err = read_vps_NAL(reader); + nal_parser.free_NAL_unit(nal); + break; + + case NAL_UNIT_SPS_NUT: + err = read_sps_NAL(reader); + nal_parser.free_NAL_unit(nal); + break; + + case NAL_UNIT_PPS_NUT: + err = read_pps_NAL(reader); + nal_parser.free_NAL_unit(nal); + break; + + case NAL_UNIT_PREFIX_SEI_NUT: + case NAL_UNIT_SUFFIX_SEI_NUT: + err = read_sei_NAL(reader, nal_hdr.nal_unit_type==NAL_UNIT_SUFFIX_SEI_NUT); + nal_parser.free_NAL_unit(nal); + break; + + case NAL_UNIT_EOS_NUT: + ctx->FirstAfterEndOfSequenceNAL = true; + nal_parser.free_NAL_unit(nal); + break; + + default: + nal_parser.free_NAL_unit(nal); + break; + } + + return err; +} + + +de265_error decoder_context::decode(int* more) +{ + decoder_context* ctx = this; + + // if the stream has ended, and no more NALs are to be decoded, flush all pictures + + if (ctx->nal_parser.get_NAL_queue_length() == 0 && + (ctx->nal_parser.is_end_of_stream() || ctx->nal_parser.is_end_of_frame()) && + ctx->image_units.empty()) { + + // flush 
all pending pictures into output queue + + // ctx->push_current_picture_to_output_queue(); // TODO: not with new queue + ctx->dpb.flush_reorder_buffer(); + + if (more) { *more = ctx->dpb.num_pictures_in_output_queue(); } + + return DE265_OK; + } + + + // if NAL-queue is empty, we need more data + // -> input stalled + + if (ctx->nal_parser.is_end_of_stream() == false && + ctx->nal_parser.is_end_of_frame() == false && + ctx->nal_parser.get_NAL_queue_length() == 0) { + if (more) { *more=1; } + + return DE265_ERROR_WAITING_FOR_INPUT_DATA; + } + + + // when there are no free image buffers in the DPB, pause decoding + // -> output stalled + + if (!ctx->dpb.has_free_dpb_picture(false)) { + if (more) *more = 1; + return DE265_ERROR_IMAGE_BUFFER_FULL; + } + + + // decode one NAL from the queue + + de265_error err = DE265_OK; + bool did_work = false; + + if (ctx->nal_parser.get_NAL_queue_length()) { // number_of_NAL_units_pending()) { + NAL_unit* nal = ctx->nal_parser.pop_from_NAL_queue(); + assert(nal); + err = ctx->decode_NAL(nal); + // ctx->nal_parser.free_NAL_unit(nal); TODO: do not free NAL with new loop + did_work=true; + } + else if (ctx->nal_parser.is_end_of_frame() == true && + ctx->image_units.empty()) { + if (more) { *more=1; } + + return DE265_ERROR_WAITING_FOR_INPUT_DATA; + } + else { + err = decode_some(&did_work); + } + + if (more) { + // decoding error is assumed to be unrecoverable + *more = (err==DE265_OK && did_work); + } + + return err; +} + + +void decoder_context::process_nal_hdr(nal_header* nal) +{ + nal_unit_type = nal->nal_unit_type; + + IdrPicFlag = isIdrPic(nal->nal_unit_type); + RapPicFlag = isRapPic(nal->nal_unit_type); +} + + + +/* 8.3.1 + */ +void decoder_context::process_picture_order_count(slice_segment_header* hdr) +{ + loginfo(LogHeaders,"POC computation. 
lsb:%d prev.pic.lsb:%d msb:%d\n", + hdr->slice_pic_order_cnt_lsb, + prevPicOrderCntLsb, + PicOrderCntMsb); + + if (isIRAP(nal_unit_type) && + NoRaslOutputFlag) + { + PicOrderCntMsb=0; + + + // flush all images from reorder buffer + + flush_reorder_buffer_at_this_frame = true; + //ctx->dpb.flush_reorder_buffer(); + } + else + { + int MaxPicOrderCntLsb = current_sps->MaxPicOrderCntLsb; + + if ((hdr->slice_pic_order_cnt_lsb < prevPicOrderCntLsb) && + (prevPicOrderCntLsb - hdr->slice_pic_order_cnt_lsb) >= MaxPicOrderCntLsb/2) { + PicOrderCntMsb = prevPicOrderCntMsb + MaxPicOrderCntLsb; + } + else if ((hdr->slice_pic_order_cnt_lsb > prevPicOrderCntLsb) && + (hdr->slice_pic_order_cnt_lsb - prevPicOrderCntLsb) > MaxPicOrderCntLsb/2) { + PicOrderCntMsb = prevPicOrderCntMsb - MaxPicOrderCntLsb; + } + else { + PicOrderCntMsb = prevPicOrderCntMsb; + } + } + + img->PicOrderCntVal = PicOrderCntMsb + hdr->slice_pic_order_cnt_lsb; + img->picture_order_cnt_lsb = hdr->slice_pic_order_cnt_lsb; + + loginfo(LogHeaders,"POC computation. new msb:%d POC=%d\n", + PicOrderCntMsb, + img->PicOrderCntVal); + + if (img->nal_hdr.nuh_temporal_id==0 && + !isSublayerNonReference(nal_unit_type) && + !isRASL(nal_unit_type) && + !isRADL(nal_unit_type)) + { + loginfo(LogHeaders,"set prevPicOrderCntLsb/Msb\n"); + + prevPicOrderCntLsb = hdr->slice_pic_order_cnt_lsb; + prevPicOrderCntMsb = PicOrderCntMsb; + } +} + + +/* 8.3.3.2 + Returns DPB index of the generated picture. 
+ */ +int decoder_context::generate_unavailable_reference_picture(const seq_parameter_set* sps, + int POC, bool longTerm) +{ + assert(dpb.has_free_dpb_picture(true)); + + std::shared_ptr current_sps = this->sps[ (int)current_pps->seq_parameter_set_id ]; + + int idx = dpb.new_image(current_sps, this, 0,0, false); + assert(idx>=0); + //printf("-> fill with unavailable POC %d\n",POC); + + de265_image* img = dpb.get_image(idx); + + img->fill_image(1<<(sps->BitDepth_Y-1), + 1<<(sps->BitDepth_C-1), + 1<<(sps->BitDepth_C-1)); + + img->fill_pred_mode(MODE_INTRA); + + img->PicOrderCntVal = POC; + img->picture_order_cnt_lsb = POC & (sps->MaxPicOrderCntLsb-1); + img->PicOutputFlag = false; + img->PicState = (longTerm ? UsedForLongTermReference : UsedForShortTermReference); + img->integrity = INTEGRITY_UNAVAILABLE_REFERENCE; + + return idx; +} + + +/* 8.3.2 invoked once per picture + + This function will mark pictures in the DPB as 'unused' or 'used for long-term reference' + */ +void decoder_context::process_reference_picture_set(slice_segment_header* hdr) +{ + std::vector removeReferencesList; + + const int currentID = img->get_ID(); + + + if (isIRAP(nal_unit_type) && NoRaslOutputFlag) { + + int currentPOC = img->PicOrderCntVal; + + // reset DPB + + /* The standard says: "When the current picture is an IRAP picture with NoRaslOutputFlag + equal to 1, all reference pictures currently in the DPB (if any) are marked as + "unused for reference". + + This seems to be wrong as it also throws out the first CRA picture in a stream like + RAP_A (decoding order: CRA,POC=64, RASL,POC=60). Removing only the pictures with + lower POCs seems to be compliant to the reference decoder. 
+ */ + + for (int i=0;iPicState != UnusedForReference && + img->PicOrderCntVal < currentPOC && + img->removed_at_picture_id > img->get_ID()) { + + removeReferencesList.push_back(img->get_ID()); + img->removed_at_picture_id = img->get_ID(); + + //printf("will remove ID %d (a)\n",img->get_ID()); + } + } + } + + + if (isIDR(nal_unit_type)) { + + // clear all reference pictures + + NumPocStCurrBefore = 0; + NumPocStCurrAfter = 0; + NumPocStFoll = 0; + NumPocLtCurr = 0; + NumPocLtFoll = 0; + } + else { + const ref_pic_set* rps = &hdr->CurrRps; + + // (8-98) + + int i,j,k; + + // scan ref-pic-set for smaller POCs and fill into PocStCurrBefore / PocStFoll + + for (i=0, j=0, k=0; + iNumNegativePics; + i++) + { + if (rps->UsedByCurrPicS0[i]) { + PocStCurrBefore[j++] = img->PicOrderCntVal + rps->DeltaPocS0[i]; + //printf("PocStCurrBefore = %d\n",PocStCurrBefore[j-1]); + } + else { + PocStFoll[k++] = img->PicOrderCntVal + rps->DeltaPocS0[i]; + } + } + + NumPocStCurrBefore = j; + + + // scan ref-pic-set for larger POCs and fill into PocStCurrAfter / PocStFoll + + for (i=0, j=0; + iNumPositivePics; + i++) + { + if (rps->UsedByCurrPicS1[i]) { + PocStCurrAfter[j++] = img->PicOrderCntVal + rps->DeltaPocS1[i]; + //printf("PocStCurrAfter = %d\n",PocStCurrAfter[j-1]); + } + else { + PocStFoll[k++] = img->PicOrderCntVal + rps->DeltaPocS1[i]; + } + } + + NumPocStCurrAfter = j; + NumPocStFoll = k; + + + // find used / future long-term references + + for (i=0, j=0, k=0; + //inum_long_term_ref_pics_sps + hdr->num_long_term_pics; + inum_long_term_sps + hdr->num_long_term_pics; + i++) + { + int pocLt = PocLsbLt[i]; + + if (hdr->delta_poc_msb_present_flag[i]) { + int currentPictureMSB = img->PicOrderCntVal - hdr->slice_pic_order_cnt_lsb; + pocLt += currentPictureMSB + - DeltaPocMsbCycleLt[i] * current_sps->MaxPicOrderCntLsb; + } + + if (UsedByCurrPicLt[i]) { + PocLtCurr[j] = pocLt; + CurrDeltaPocMsbPresentFlag[j] = hdr->delta_poc_msb_present_flag[i]; + j++; + } + else { + PocLtFoll[k] = 
pocLt; + FollDeltaPocMsbPresentFlag[k] = hdr->delta_poc_msb_present_flag[i]; + k++; + } + } + + NumPocLtCurr = j; + NumPocLtFoll = k; + } + + + // (old 8-99) / (new 8-106) + // 1. + + std::vector picInAnyList(dpb.size(), false); + + + dpb.log_dpb_content(); + + for (int i=0;i=0) picInAnyList[k]=true; + else { + // TODO, CHECK: is it ok that we generate a picture with POC = LSB (PocLtCurr) + // We do not know the correct MSB + int concealedPicture = generate_unavailable_reference_picture(current_sps.get(), + PocLtCurr[i], true); + picInAnyList.resize(dpb.size(), false); // adjust size of array to hold new picture + + RefPicSetLtCurr[i] = k = concealedPicture; + picInAnyList[concealedPicture]=true; + } + + if (dpb.get_image(k)->integrity != INTEGRITY_CORRECT) { + img->integrity = INTEGRITY_DERIVED_FROM_FAULTY_REFERENCE; + } + } + + + for (int i=0;i=0) picInAnyList[k]=true; + else { + int concealedPicture = k = generate_unavailable_reference_picture(current_sps.get(), + PocLtFoll[i], true); + picInAnyList.resize(dpb.size(), false); // adjust size of array to hold new picture + + RefPicSetLtFoll[i] = concealedPicture; + picInAnyList[concealedPicture]=true; + } + } + + + // 2. Mark all pictures in RefPicSetLtCurr / RefPicSetLtFoll as UsedForLongTermReference + + for (int i=0;iPicState = UsedForLongTermReference; + } + + for (int i=0;iPicState = UsedForLongTermReference; + } + + + // 3. 
+ + for (int i=0;i idx=%d\n",PocStCurrBefore[i], k); + + RefPicSetStCurrBefore[i] = k; // -1 == "no reference picture" + if (k>=0) picInAnyList[k]=true; + else { + int concealedPicture = generate_unavailable_reference_picture(current_sps.get(), + PocStCurrBefore[i], false); + RefPicSetStCurrBefore[i] = k = concealedPicture; + + picInAnyList.resize(dpb.size(), false); // adjust size of array to hold new picture + picInAnyList[concealedPicture] = true; + + //printf(" concealed: %d\n", concealedPicture); + } + + if (dpb.get_image(k)->integrity != INTEGRITY_CORRECT) { + img->integrity = INTEGRITY_DERIVED_FROM_FAULTY_REFERENCE; + } + } + + for (int i=0;i idx=%d\n",PocStCurrAfter[i], k); + + RefPicSetStCurrAfter[i] = k; // -1 == "no reference picture" + if (k>=0) picInAnyList[k]=true; + else { + int concealedPicture = generate_unavailable_reference_picture(current_sps.get(), + PocStCurrAfter[i], false); + RefPicSetStCurrAfter[i] = k = concealedPicture; + + + picInAnyList.resize(dpb.size(), false); // adjust size of array to hold new picture + picInAnyList[concealedPicture]=true; + + //printf(" concealed: %d\n", concealedPicture); + } + + if (dpb.get_image(k)->integrity != INTEGRITY_CORRECT) { + img->integrity = INTEGRITY_DERIVED_FROM_FAULTY_REFERENCE; + } + } + + for (int i=0;i=0) picInAnyList[k]=true; + } + + // 4. 
any picture that is not marked for reference is put into the "UnusedForReference" state + + for (int i=0;i=picInAnyList.size() || !picInAnyList[i]) // no reference + { + de265_image* dpbimg = dpb.get_image(i); + if (dpbimg != img && // not the current picture + dpbimg->removed_at_picture_id > img->get_ID()) // has not been removed before + { + if (dpbimg->PicState != UnusedForReference) { + removeReferencesList.push_back(dpbimg->get_ID()); + //printf("will remove ID %d (b)\n",dpbimg->get_ID()); + + dpbimg->removed_at_picture_id = img->get_ID(); + } + } + } + + hdr->RemoveReferencesList = removeReferencesList; + + //remove_images_from_dpb(hdr->RemoveReferencesList); +} + + +// 8.3.4 +// Returns whether we can continue decoding (or whether there is a severe error). +/* Called at beginning of each slice. + + Constructs + - the RefPicList[2][], containing indices into the DPB, and + - the RefPicList_POC[2][], containing POCs. + - LongTermRefPic[2][] is also set to true if it is a long-term reference + */ +bool decoder_context::construct_reference_picture_lists(slice_segment_header* hdr) +{ + int NumPocTotalCurr = hdr->NumPocTotalCurr; + int NumRpsCurrTempList0 = libde265_max(hdr->num_ref_idx_l0_active, NumPocTotalCurr); + + // TODO: fold code for both lists together + + int RefPicListTemp0[3*MAX_NUM_REF_PICS]; // TODO: what would be the correct maximum ? + int RefPicListTemp1[3*MAX_NUM_REF_PICS]; // TODO: what would be the correct maximum ? 
+ char isLongTerm[2][3*MAX_NUM_REF_PICS]; + + memset(isLongTerm,0,2*3*MAX_NUM_REF_PICS); + + /* --- Fill RefPicListTmp0 with reference pictures in this order: + 1) short term, past POC + 2) short term, future POC + 3) long term + */ + + int rIdx=0; + while (rIdx < NumRpsCurrTempList0) { + for (int i=0;inum_ref_idx_l0_active > 16) { + add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false); + return false; + } + */ + + assert(hdr->num_ref_idx_l0_active <= 16); + for (rIdx=0; rIdxnum_ref_idx_l0_active; rIdx++) { + int idx = hdr->ref_pic_list_modification_flag_l0 ? hdr->list_entry_l0[rIdx] : rIdx; + + hdr->RefPicList[0][rIdx] = RefPicListTemp0[idx]; + hdr->LongTermRefPic[0][rIdx] = isLongTerm[0][idx]; + + // remember POC of referenced image (needed in motion.c, derive_collocated_motion_vector) + de265_image* img_0_rIdx = dpb.get_image(hdr->RefPicList[0][rIdx]); + if (img_0_rIdx==NULL) { + return false; + } + hdr->RefPicList_POC[0][rIdx] = img_0_rIdx->PicOrderCntVal; + hdr->RefPicList_PicState[0][rIdx] = img_0_rIdx->PicState; + } + + + /* --- Fill RefPicListTmp1 with reference pictures in this order: + 1) short term, future POC + 2) short term, past POC + 3) long term + */ + + if (hdr->slice_type == SLICE_TYPE_B) { + int NumRpsCurrTempList1 = libde265_max(hdr->num_ref_idx_l1_active, NumPocTotalCurr); + + int rIdx=0; + while (rIdx < NumRpsCurrTempList1) { + for (int i=0;inum_ref_idx_l0_active > 16) { + add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false); + return false; + } + + assert(hdr->num_ref_idx_l1_active <= 16); + for (rIdx=0; rIdxnum_ref_idx_l1_active; rIdx++) { + int idx = hdr->ref_pic_list_modification_flag_l1 ? 
hdr->list_entry_l1[rIdx] : rIdx; + + hdr->RefPicList[1][rIdx] = RefPicListTemp1[idx]; + hdr->LongTermRefPic[1][rIdx] = isLongTerm[1][idx]; + + // remember POC of referenced imaged (needed in motion.c, derive_collocated_motion_vector) + de265_image* img_1_rIdx = dpb.get_image(hdr->RefPicList[1][rIdx]); + if (img_1_rIdx == NULL) { return false; } + hdr->RefPicList_POC[1][rIdx] = img_1_rIdx->PicOrderCntVal; + hdr->RefPicList_PicState[1][rIdx] = img_1_rIdx->PicState; + } + } + + + // show reference picture lists + + loginfo(LogHeaders,"RefPicList[0] ="); + for (rIdx=0; rIdxnum_ref_idx_l0_active; rIdx++) { + loginfo(LogHeaders,"* [%d]=%d (LT=%d)", + hdr->RefPicList[0][rIdx], + hdr->RefPicList_POC[0][rIdx], + hdr->LongTermRefPic[0][rIdx] + ); + } + loginfo(LogHeaders,"*\n"); + + if (hdr->slice_type == SLICE_TYPE_B) { + loginfo(LogHeaders,"RefPicList[1] ="); + for (rIdx=0; rIdxnum_ref_idx_l1_active; rIdx++) { + loginfo(LogHeaders,"* [%d]=%d (LT=%d)", + hdr->RefPicList[1][rIdx], + hdr->RefPicList_POC[1][rIdx], + hdr->LongTermRefPic[1][rIdx] + ); + } + loginfo(LogHeaders,"*\n"); + } + + return true; +} + + + +void decoder_context::run_postprocessing_filters_sequential(de265_image* img) +{ +#if SAVE_INTERMEDIATE_IMAGES + char buf[1000]; + sprintf(buf,"pre-lf-%05d.yuv", img->PicOrderCntVal); + write_picture_to_file(img, buf); +#endif + + if (!img->decctx->param_disable_deblocking) { + apply_deblocking_filter(img); + } + +#if SAVE_INTERMEDIATE_IMAGES + sprintf(buf,"pre-sao-%05d.yuv", img->PicOrderCntVal); + write_picture_to_file(img, buf); +#endif + + if (!img->decctx->param_disable_sao) { + apply_sample_adaptive_offset_sequential(img); + } + +#if SAVE_INTERMEDIATE_IMAGES + sprintf(buf,"sao-%05d.yuv", img->PicOrderCntVal); + write_picture_to_file(img, buf); +#endif +} + + +void decoder_context::run_postprocessing_filters_parallel(image_unit* imgunit) +{ + de265_image* img = imgunit->img; + + int saoWaitsForProgress = CTB_PROGRESS_PREFILTER; + bool waitForCompletion = false; + 
+ if (!img->decctx->param_disable_deblocking) { + add_deblocking_tasks(imgunit); + saoWaitsForProgress = CTB_PROGRESS_DEBLK_H; + } + + if (!img->decctx->param_disable_sao) { + waitForCompletion |= add_sao_tasks(imgunit, saoWaitsForProgress); + //apply_sample_adaptive_offset(img); + } + + img->wait_for_completion(); +} + +/* +void decoder_context::push_current_picture_to_output_queue() +{ + push_picture_to_output_queue(img); +} +*/ + +de265_error decoder_context::push_picture_to_output_queue(image_unit* imgunit) +{ + de265_image* outimg = imgunit->img; + + if (outimg==NULL) { return DE265_OK; } + + + // push image into output queue + + if (outimg->PicOutputFlag) { + loginfo(LogDPB,"new picture has output-flag=true\n"); + + if (outimg->integrity != INTEGRITY_CORRECT && + param_suppress_faulty_pictures) { + } + else { + dpb.insert_image_into_reorder_buffer(outimg); + } + + loginfo(LogDPB,"push image %d into reordering queue\n", outimg->PicOrderCntVal); + } + + // check for full reorder buffers + + int maxNumPicsInReorderBuffer = 0; + + // TODO: I'd like to have the has_vps() check somewhere else (not decode the picture at all) + if (outimg->has_vps()) { + int sublayer = outimg->get_vps().vps_max_sub_layers -1; + maxNumPicsInReorderBuffer = outimg->get_vps().layer[sublayer].vps_max_num_reorder_pics; + } + + if (dpb.num_pictures_in_reorder_buffer() > maxNumPicsInReorderBuffer) { + dpb.output_next_picture_in_reorder_buffer(); + } + + dpb.log_dpb_queues(); + + return DE265_OK; +} + + +// returns whether we can continue decoding the stream or whether we should give up +bool decoder_context::process_slice_segment_header(slice_segment_header* hdr, + de265_error* err, de265_PTS pts, + nal_header* nal_hdr, + void* user_data) +{ + *err = DE265_OK; + + flush_reorder_buffer_at_this_frame = false; + + + // get PPS and SPS for this slice + + int pps_id = hdr->slice_pic_parameter_set_id; + if (pps[pps_id]->pps_read==false) { + logerror(LogHeaders, "PPS %d has not been read\n", 
pps_id); + assert(false); // TODO + } + + current_pps = pps[pps_id]; + current_sps = sps[ (int)current_pps->seq_parameter_set_id ]; + current_vps = vps[ (int)current_sps->video_parameter_set_id ]; + + calc_tid_and_framerate_ratio(); + + + // --- prepare decoding of new picture --- + + if (hdr->first_slice_segment_in_pic_flag) { + + // previous picture has been completely decoded + + //ctx->push_current_picture_to_output_queue(); + + current_image_poc_lsb = hdr->slice_pic_order_cnt_lsb; + + + seq_parameter_set* sps = current_sps.get(); + + + // --- find and allocate image buffer for decoding --- + + int image_buffer_idx; + bool isOutputImage = (!sps->sample_adaptive_offset_enabled_flag || param_disable_sao); + image_buffer_idx = dpb.new_image(current_sps, this, pts, user_data, isOutputImage); + if (image_buffer_idx == -1) { + *err = DE265_ERROR_IMAGE_BUFFER_FULL; + return false; + } + + /*de265_image* */ img = dpb.get_image(image_buffer_idx); + img->nal_hdr = *nal_hdr; + + // Note: sps is already set in new_image() -> ??? still the case with shared_ptr ? 
+ + img->set_headers(current_vps, current_sps, current_pps); + + img->decctx = this; + + img->clear_metadata(); + + + if (isIRAP(nal_unit_type)) { + if (isIDR(nal_unit_type) || + isBLA(nal_unit_type) || + first_decoded_picture || + FirstAfterEndOfSequenceNAL) + { + NoRaslOutputFlag = true; + FirstAfterEndOfSequenceNAL = false; + } + else if (0) // TODO: set HandleCraAsBlaFlag by external means + { + } + else + { + NoRaslOutputFlag = false; + HandleCraAsBlaFlag = false; + } + } + + + if (isRASL(nal_unit_type) && + NoRaslOutputFlag) + { + img->PicOutputFlag = false; + } + else + { + img->PicOutputFlag = !!hdr->pic_output_flag; + } + + process_picture_order_count(hdr); + + if (hdr->first_slice_segment_in_pic_flag) { + // mark picture so that it is not overwritten by unavailable reference frames + img->PicState = UsedForShortTermReference; + + process_reference_picture_set(hdr); + } + + img->PicState = UsedForShortTermReference; + + log_set_current_POC(img->PicOrderCntVal); + + + // next image is not the first anymore + + first_decoded_picture = false; + } + else { + // claims to be not the first slice, but there is no active image available + + if (img == NULL) { + return false; + } + } + + if (hdr->slice_type == SLICE_TYPE_B || + hdr->slice_type == SLICE_TYPE_P) + { + bool success = construct_reference_picture_lists(hdr); + if (!success) { + return false; + } + } + + //printf("process slice segment header\n"); + + loginfo(LogHeaders,"end of process-slice-header\n"); + dpb.log_dpb_content(); + + + if (hdr->dependent_slice_segment_flag==0) { + hdr->SliceAddrRS = hdr->slice_segment_address; + } else { + hdr->SliceAddrRS = previous_slice_header->SliceAddrRS; + } + + previous_slice_header = hdr; + + + loginfo(LogHeaders,"SliceAddrRS = %d\n",hdr->SliceAddrRS); + + return true; +} + + +void decoder_context::remove_images_from_dpb(const std::vector& removeImageList) +{ + for (int i=0;i=0) { + //printf("remove ID %d\n", removeImageList[i]); + de265_image* dpbimg = 
dpb.get_image( idx ); + dpbimg->PicState = UnusedForReference; + } + } +} + + + +/* + . 0 1 2 <- goal_HighestTid + +-----+-----+-----+ + | -0->| -1->| -2->| + +-----+-----+-----+ + 0 33 66 100 <- framerate_ratio + */ + +int decoder_context::get_highest_TID() const +{ + if (current_sps) { return current_sps->sps_max_sub_layers-1; } + if (current_vps) { return current_vps->vps_max_sub_layers-1; } + + return 6; +} + +void decoder_context::set_limit_TID(int max_tid) +{ + limit_HighestTid = max_tid; + calc_tid_and_framerate_ratio(); +} + +int decoder_context::change_framerate(int more) +{ + if (current_sps == NULL) { return framerate_ratio; } + + int highestTid = get_highest_TID(); + + assert(more>=-1 && more<=1); + + goal_HighestTid += more; + goal_HighestTid = std::max(goal_HighestTid, 0); + goal_HighestTid = std::min(goal_HighestTid, highestTid); + + framerate_ratio = framedrop_tid_index[goal_HighestTid]; + + calc_tid_and_framerate_ratio(); + + return framerate_ratio; +} + +void decoder_context::set_framerate_ratio(int percent) +{ + framerate_ratio = percent; + calc_tid_and_framerate_ratio(); +} + +void decoder_context::compute_framedrop_table() +{ + int highestTID = get_highest_TID(); + + for (int tid=highestTID ; tid>=0 ; tid--) { + int lower = 100 * tid /(highestTID+1); + int higher = 100 * (tid+1)/(highestTID+1); + + for (int l=lower; l<=higher; l++) { + int ratio = 100 * (l-lower) / (higher-lower); + + // if we would exceed our TID limit, decode the highest TID at full frame-rate + if (tid > limit_HighestTid) { + tid = limit_HighestTid; + ratio = 100; + } + + framedrop_tab[l].tid = tid; + framedrop_tab[l].ratio = ratio; + } + + framedrop_tid_index[tid] = higher; + } + +#if 0 + for (int i=0;i<=100;i++) { + printf("%d%%: %d/%d",i, framedrop_tab[i].tid, framedrop_tab[i].ratio); + for (int k=0;k<=highestTID;k++) { + if (framedrop_tid_index[k] == i) printf(" ** TID=%d **",k); + } + printf("\n"); + } +#endif +} + +void decoder_context::calc_tid_and_framerate_ratio() 
+{ + int highestTID = get_highest_TID(); + + + // if number of temporal layers changed, we have to recompute the framedrop table + + if (framedrop_tab[100].tid != highestTID) { + compute_framedrop_table(); + } + + goal_HighestTid = framedrop_tab[framerate_ratio].tid; + layer_framerate_ratio = framedrop_tab[framerate_ratio].ratio; + + // TODO: for now, we switch immediately + current_HighestTid = goal_HighestTid; +} + + +void error_queue::add_warning(de265_error warning, bool once) +{ + // check if warning was already shown + bool add=true; + if (once) { + for (int i=0;i + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "dpb.h" +#include "decctx.h" +#include +#include + + +#define DPB_DEFAULT_MAX_IMAGES 30 + + +decoded_picture_buffer::decoded_picture_buffer() +{ + max_images_in_DPB = DPB_DEFAULT_MAX_IMAGES; + norm_images_in_DPB = DPB_DEFAULT_MAX_IMAGES; +} + + +decoded_picture_buffer::~decoded_picture_buffer() +{ + for (int i=0;iPicOrderCntVal, + dpb[i]->get_ID(), + dpb[i]->PicState == UnusedForReference ? "unused" : + dpb[i]->PicState == UsedForShortTermReference ? "short-term" : "long-term", + dpb[i]->PicOutputFlag ? 
"output" : "---"); + } +} + + +bool decoded_picture_buffer::has_free_dpb_picture(bool high_priority) const +{ + // we will always adapt the buffer to insert high-priority images + if (high_priority) return true; + + // quick test to check for free slots + if (dpb.size() < max_images_in_DPB) return true; + + // scan for empty slots + for (int i=0;iPicOutputFlag==false && dpb[i]->PicState == UnusedForReference) { + return true; + } + } + + return false; +} + + +int decoded_picture_buffer::DPB_index_of_picture_with_POC(int poc, int currentID, bool preferLongTerm) const +{ + logdebug(LogHeaders,"DPB_index_of_picture_with_POC POC=%d\n",poc); + + //log_dpb_content(ctx); + //loginfo(LogDPB,"searching for short-term reference POC=%d\n",poc); + + if (preferLongTerm) { + for (int k=0;kPicOrderCntVal == poc && + dpb[k]->removed_at_picture_id > currentID && + dpb[k]->PicState == UsedForLongTermReference) { + return k; + } + } + } + + for (int k=0;kPicOrderCntVal == poc && + dpb[k]->removed_at_picture_id > currentID && + dpb[k]->PicState != UnusedForReference) { + return k; + } + } + + return -1; +} + + +int decoded_picture_buffer::DPB_index_of_picture_with_LSB(int lsb, int currentID, bool preferLongTerm) const +{ + logdebug(LogHeaders,"get access to picture with LSB %d from DPB\n",lsb); + + if (preferLongTerm) { + for (int k=0;kpicture_order_cnt_lsb == lsb && + dpb[k]->removed_at_picture_id > currentID && + dpb[k]->PicState == UsedForLongTermReference) { + return k; + } + } + } + + for (int k=0;kpicture_order_cnt_lsb == lsb && + dpb[k]->removed_at_picture_id > currentID && + dpb[k]->PicState != UnusedForReference) { + return k; + } + } + + return -1; +} + + +int decoded_picture_buffer::DPB_index_of_picture_with_ID(int id) const +{ + logdebug(LogHeaders,"get access to picture with ID %d from DPB\n",id); + + for (int k=0;kget_ID() == id) { + return k; + } + } + + return -1; +} + + +void decoded_picture_buffer::output_next_picture_in_reorder_buffer() +{ + 
assert(!reorder_output_queue.empty()); + + // search for picture in reorder buffer with minimum POC + + int minPOC = reorder_output_queue[0]->PicOrderCntVal; + int minIdx = 0; + for (int i=1;iPicOrderCntVal < minPOC) { + minPOC = reorder_output_queue[i]->PicOrderCntVal; + minIdx = i; + } + } + + + // put image into output queue + + image_output_queue.push_back(reorder_output_queue[minIdx]); + + + // remove image from reorder buffer + + reorder_output_queue[minIdx] = reorder_output_queue.back(); + reorder_output_queue.pop_back(); +} + + +bool decoded_picture_buffer::flush_reorder_buffer() +{ + // return 'false' when there are no pictures in reorder buffer + if (reorder_output_queue.empty()) return false; + + while (!reorder_output_queue.empty()) { + output_next_picture_in_reorder_buffer(); + } + + return true; +} + + +void decoded_picture_buffer::clear() +{ + for (int i=0;iPicOutputFlag || + dpb[i]->PicState != UnusedForReference) + { + dpb[i]->PicOutputFlag = false; + dpb[i]->PicState = UnusedForReference; + dpb[i]->release(); + } + } + + reorder_output_queue.clear(); + image_output_queue.clear(); +} + + +int decoded_picture_buffer::new_image(std::shared_ptr sps, + decoder_context* decctx, + de265_PTS pts, void* user_data, bool isOutputImage) +{ + loginfo(LogHeaders,"DPB::new_image\n"); + log_dpb_content(); + + // --- search for a free slot in the DPB --- + + int free_image_buffer_idx = -1; + for (int i=0;ican_be_released()) { + dpb[i]->release(); /* TODO: this is surely not the best place to free the image, but + we have to do it here because releasing it in de265_release_image() + would break the API compatibility. */ + + free_image_buffer_idx = i; + break; + } + } + + + // Try to free a buffer at the end if the DPB got too large. + /* This should also probably move to a better place as soon as the API allows for this. 
*/ + + if (dpb.size() > norm_images_in_DPB && // buffer too large + free_image_buffer_idx != dpb.size()-1 && // last slot not reused in this alloc + dpb.back()->can_be_released()) // last slot is free + { + delete dpb.back(); + dpb.pop_back(); + } + + + // create a new image slot if no empty slot remaining + + if (free_image_buffer_idx == -1) { + free_image_buffer_idx = dpb.size(); + dpb.push_back(new de265_image); + } + + + // --- allocate new image --- + + de265_image* img = dpb[free_image_buffer_idx]; + + int w = sps->pic_width_in_luma_samples; + int h = sps->pic_height_in_luma_samples; + + enum de265_chroma chroma; + switch (sps->chroma_format_idc) { + case 0: chroma = de265_chroma_mono; break; + case 1: chroma = de265_chroma_420; break; + case 2: chroma = de265_chroma_422; break; + case 3: chroma = de265_chroma_444; break; + default: chroma = de265_chroma_420; assert(0); break; // should never happen + } + + img->alloc_image(w,h, chroma, sps, true, decctx, /*NULL,*/ pts, user_data, isOutputImage); + + img->integrity = INTEGRITY_CORRECT; + + return free_image_buffer_idx; +} + + +void decoded_picture_buffer::pop_next_picture_in_output_queue() +{ + image_output_queue.pop_front(); + + + loginfo(LogDPB, "DPB output queue: "); + for (int i=0;iPicOrderCntVal); + } + loginfo(LogDPB,"*\n"); +} + + +void decoded_picture_buffer::log_dpb_queues() const +{ + loginfo(LogDPB, "DPB reorder queue (after push): "); + for (int i=0;iPicOrderCntVal); + } + loginfo(LogDPB,"*\n"); + + loginfo(LogDPB, "DPB output queue (after push): "); + for (int i=0;iPicOrderCntVal); + } + loginfo(LogDPB,"*\n"); +} diff --git a/fallback-dct.cc b/fallback-dct.cc new file mode 100644 index 0000000..2e99c7c --- /dev/null +++ b/fallback-dct.cc @@ -0,0 +1,1210 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. 
+ * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "fallback-dct.h" + +#if defined(_MSC_VER) || defined(__MINGW32__) +# include +#elif defined(HAVE_ALLOCA_H) +# include +#endif + +#include +#include + + +static void printMatrix(const char* name, const int16_t* v, int n) +{ + printf("--- %s ---\n",name); + for (int r=0;r>bdShift2; + + dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + c); + } +} + + +void transform_skip_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) +{ + int nT = 4; + int bdShift2 = 20-bit_depth; + + assert(0); // DEPRECATED, should not be used anymore because of fixed 4x4 size + + for (int y=0;y>bdShift2; + + dst[y*stride+x] = Clip_BitDepth(dst[y*stride+x] + c, bit_depth); + } +} + + +void transform_skip_residual_fallback(int32_t *residual, const int16_t *coeffs, int nT, + int tsShift,int bdShift) +{ + const int rnd = 1<<(bdShift-1); + + for (int y=0;y> bdShift; + } +} + + +void transform_skip_rdpcm_v_8_fallback(uint8_t *dst, const int16_t *coeffs, int log2nT, ptrdiff_t stride) +{ + int bitDepth = 8; + int bdShift2 = 20-bitDepth; + int offset = (1<<(bdShift2-1)); + int tsShift = 5 + log2nT; // TODO: extended_precision + int nT = 1<>bdShift2; + + dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + sum); + } + } +} + +void transform_skip_rdpcm_h_8_fallback(uint8_t *dst, const int16_t *coeffs, int log2nT, ptrdiff_t stride) +{ + int 
bitDepth = 8; + int bdShift2 = 20-bitDepth; + int offset = (1<<(bdShift2-1)); + int tsShift = 5 + log2nT; // TODO: extended_precision + int nT = 1<>bdShift2; + + dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + sum); + } + } +} + + +void transform_bypass_rdpcm_v_8_fallback(uint8_t *dst, const int16_t *coeffs,int nT,ptrdiff_t stride) +{ + for (int x=0;x>bdShift; + residual[y*nT+x] = sum; + } + } +} + + +void rdpcm_h_fallback(int32_t* residual, const int16_t* coeffs, int nT,int tsShift,int bdShift) +{ + int rnd = (1<<(bdShift-1)); + + for (int y=0;y>bdShift; + residual[y*nT+x] = sum; + } + } +} + + +void transform_bypass_fallback(int32_t *dst, const int16_t *coeffs, int nT) +{ + for (int y=0;y "); + */ + + for (int i=0;i<4;i++) { + int sum=0; + + for (int j=0;j<4;j++) { + sum += mat_8_357[j][i] * coeffs[c+j*4]; + } + + g[i][c] = Clip3(-32768,32767, (sum+rndV)>>7); + } + + /* + for (int y=0;y<4;y++) { + logtrace(LogTransform,"*%d ",g[y][c]); + } + logtrace(LogTransform,"*\n"); + */ + } + + + // --- H --- + + for (int y=0;y<4;y++) { + + /* + logtrace(LogTransform,"DST-H: "); + for (int c=0;c<4;c++) { + logtrace(LogTransform,"%d ",g[y][c]); + } + logtrace(LogTransform,"* -> "); + */ + + for (int i=0;i<4;i++) { + int sum=0; + + for (int j=0;j<4;j++) { + sum += mat_8_357[j][i] * g[y][j]; + } + + int out = Clip3(-32768,32767, (sum+rndH)>>postShift); + + dst[y*stride+i] = Clip1_8bit(dst[y*stride+i] + out); + + logtrace(LogTransform,"*%d ",out); + } + + logtrace(LogTransform,"*\n"); + } +} + + +void transform_4x4_luma_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, + int bit_depth) +{ + int16_t g[4][4]; + + int postShift = 20-bit_depth; + int rndV = 1<<(7-1); + int rndH = 1<<(postShift-1); + + + // --- V --- + + for (int c=0;c<4;c++) { + /* + logtrace(LogTransform,"DST-V: "); + for (int r=0;r<4;r++) { + logtrace(LogTransform,"%d ",coeffs[c+r*4]); + } + logtrace(LogTransform,"* -> "); + */ + + for (int i=0;i<4;i++) { + int sum=0; + + for (int 
j=0;j<4;j++) { + sum += mat_8_357[j][i] * coeffs[c+j*4]; + } + + g[i][c] = Clip3(-32768,32767, (sum+rndV)>>7); + } + + /* + for (int y=0;y<4;y++) { + logtrace(LogTransform,"*%d ",g[y][c]); + } + logtrace(LogTransform,"*\n"); + */ + } + + + // --- H --- + + for (int y=0;y<4;y++) { + + /* + logtrace(LogTransform,"DST-H: "); + for (int c=0;c<4;c++) { + logtrace(LogTransform,"%d ",g[y][c]); + } + logtrace(LogTransform,"* -> "); + */ + + for (int i=0;i<4;i++) { + int sum=0; + + for (int j=0;j<4;j++) { + sum += mat_8_357[j][i] * g[y][j]; + } + + int out = Clip3(-32768,32767, (sum+rndH)>>postShift); + + dst[y*stride+i] = Clip_BitDepth(dst[y*stride+i] + out, bit_depth); + + logtrace(LogTransform,"*%d ",out); + } + + logtrace(LogTransform,"*\n"); + } +} + + +void fdst_4x4_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) +{ + int16_t g[4*4]; + + int BD = 8; + int shift1 = Log2(4) + BD -9; + int shift2 = Log2(4) + 6; + + int rnd1 = 1<<(shift1-1); + int rnd2 = 1<<(shift2-1); + + + // --- V --- + + for (int c=0;c<4;c++) { + + /* + logtrace(LogTransform,"DST-V: "); + for (int r=0;r<4;r++) { + logtrace(LogTransform,"%d ",coeffs[c+r*4]); + } + logtrace(LogTransform,"* -> "); + */ + + for (int i=0;i<4;i++) { + int sum=0; + + for (int j=0;j<4;j++) { + sum += mat_8_357[i][j] * input[c+j*stride]; + } + + g[c+4*i] = Clip3(-32768,32767, (sum+rnd1)>>shift1); + } + } + + + // --- H --- + + for (int y=0;y<4;y++) { + for (int i=0;i<4;i++) { + int sum=0; + + for (int j=0;j<4;j++) { + sum += mat_8_357[i][j] * g[y*4+j]; + } + + // TODO: do we need clipping ? 
+ int out = (sum+rnd2)>>shift2; // Clip3(-32768,32767, (sum+rndH)>>postShift); + + coeffs[y*4+i] = out; + + logtrace(LogTransform,"*%d ",out); + } + + logtrace(LogTransform,"*\n"); + } +} + + +void transform_idst_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits) +{ + int16_t g[4][4]; + + int rndV = 1<<(7-1); + int rndH = 1<<(bdShift-1); + + int CoeffMax = (1<>7); + } + } + + + // --- H --- + + for (int y=0;y<4;y++) { + for (int i=0;i<4;i++) { + int sum=0; + + for (int j=0;j<4;j++) { + sum += mat_8_357[j][i] * g[y][j]; + } + + dst[y*4+i] = (sum + rndH)>>bdShift; + } + } +} + + + +static int8_t mat_dct[32][32] = { + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}, + { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -4,-13,-22,-31,-38,-46,-54,-61,-67,-73,-78,-82,-85,-88,-90,-90}, + { 90, 87, 80, 70, 57, 43, 25, 9, -9,-25,-43,-57,-70,-80,-87,-90, -90,-87,-80,-70,-57,-43,-25, -9, 9, 25, 43, 57, 70, 80, 87, 90}, + { 90, 82, 67, 46, 22, -4,-31,-54,-73,-85,-90,-88,-78,-61,-38,-13, 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4,-22,-46,-67,-82,-90}, + { 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89, 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89}, + { 88, 67, 31,-13,-54,-82,-90,-78,-46, -4, 38, 73, 90, 85, 61, 22, -22,-61,-85,-90,-73,-38, 4, 46, 78, 90, 82, 54, 13,-31,-67,-88}, + { 87, 57, 9,-43,-80,-90,-70,-25, 25, 70, 90, 80, 43, -9,-57,-87, -87,-57, -9, 43, 80, 90, 70, 25,-25,-70,-90,-80,-43, 9, 57, 87}, + { 85, 46,-13,-67,-90,-73,-22, 38, 82, 88, 54, -4,-61,-90,-78,-31, 31, 78, 90, 61, 4,-54,-88,-82,-38, 22, 73, 90, 67, 13,-46,-85}, + { 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83}, + { 82, 22,-54,-90,-61, 13, 78, 85, 31,-46,-90,-67, 4, 73, 88, 38, -38,-88,-73, -4, 67, 90, 46,-31,-85,-78,-13, 61, 90, 54,-22,-82}, + { 80, 
9,-70,-87,-25, 57, 90, 43,-43,-90,-57, 25, 87, 70, -9,-80, -80, -9, 70, 87, 25,-57,-90,-43, 43, 90, 57,-25,-87,-70, 9, 80}, + { 78, -4,-82,-73, 13, 85, 67,-22,-88,-61, 31, 90, 54,-38,-90,-46, 46, 90, 38,-54,-90,-31, 61, 88, 22,-67,-85,-13, 73, 82, 4,-78}, + { 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75, 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75}, + { 73,-31,-90,-22, 78, 67,-38,-90,-13, 82, 61,-46,-88, -4, 85, 54, -54,-85, 4, 88, 46,-61,-82, 13, 90, 38,-67,-78, 22, 90, 31,-73}, + { 70,-43,-87, 9, 90, 25,-80,-57, 57, 80,-25,-90, -9, 87, 43,-70, -70, 43, 87, -9,-90,-25, 80, 57,-57,-80, 25, 90, 9,-87,-43, 70}, + { 67,-54,-78, 38, 85,-22,-90, 4, 90, 13,-88,-31, 82, 46,-73,-61, 61, 73,-46,-82, 31, 88,-13,-90, -4, 90, 22,-85,-38, 78, 54,-67}, + { 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64}, + { 61,-73,-46, 82, 31,-88,-13, 90, -4,-90, 22, 85,-38,-78, 54, 67, -67,-54, 78, 38,-85,-22, 90, 4,-90, 13, 88,-31,-82, 46, 73,-61}, + { 57,-80,-25, 90, -9,-87, 43, 70,-70,-43, 87, 9,-90, 25, 80,-57, -57, 80, 25,-90, 9, 87,-43,-70, 70, 43,-87, -9, 90,-25,-80, 57}, + { 54,-85, -4, 88,-46,-61, 82, 13,-90, 38, 67,-78,-22, 90,-31,-73, 73, 31,-90, 22, 78,-67,-38, 90,-13,-82, 61, 46,-88, 4, 85,-54}, + { 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50, 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50}, + { 46,-90, 38, 54,-90, 31, 61,-88, 22, 67,-85, 13, 73,-82, 4, 78, -78, -4, 82,-73,-13, 85,-67,-22, 88,-61,-31, 90,-54,-38, 90,-46}, + { 43,-90, 57, 25,-87, 70, 9,-80, 80, -9,-70, 87,-25,-57, 90,-43, -43, 90,-57,-25, 87,-70, -9, 80,-80, 9, 70,-87, 25, 57,-90, 43}, + { 38,-88, 73, -4,-67, 90,-46,-31, 85,-78, 13, 61,-90, 54, 22,-82, 82,-22,-54, 90,-61,-13, 78,-85, 31, 46,-90, 67, 4,-73, 88,-38}, + { 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36}, + { 31,-78, 90,-61, 
4, 54,-88, 82,-38,-22, 73,-90, 67,-13,-46, 85, -85, 46, 13,-67, 90,-73, 22, 38,-82, 88,-54, -4, 61,-90, 78,-31}, + { 25,-70, 90,-80, 43, 9,-57, 87,-87, 57, -9,-43, 80,-90, 70,-25, -25, 70,-90, 80,-43, -9, 57,-87, 87,-57, 9, 43,-80, 90,-70, 25}, + { 22,-61, 85,-90, 73,-38, -4, 46,-78, 90,-82, 54,-13,-31, 67,-88, 88,-67, 31, 13,-54, 82,-90, 78,-46, 4, 38,-73, 90,-85, 61,-22}, + { 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18, 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18}, + { 13,-38, 61,-78, 88,-90, 85,-73, 54,-31, 4, 22,-46, 67,-82, 90, -90, 82,-67, 46,-22, -4, 31,-54, 73,-85, 90,-88, 78,-61, 38,-13}, + { 9,-25, 43,-57, 70,-80, 87,-90, 90,-87, 80,-70, 57,-43, 25, -9, -9, 25,-43, 57,-70, 80,-87, 90,-90, 87,-80, 70,-57, 43,-25, 9}, + { 4,-13, 22,-31, 38,-46, 54,-61, 67,-73, 78,-82, 85,-88, 90,-90, 90,-90, 88,-85, 82,-78, 73,-67, 61,-54, 46,-38, 31,-22, 13, -4} +}; + + + + +template +void transform_idct_add(pixel_t *dst, ptrdiff_t stride, + int nT, const int16_t *coeffs, int bit_depth) +{ + /* + The effective shift is + 7 bits right for bit-depth 8, + 6 bits right for bit-depth 9, + 5 bits right for bit-depth 10. + + Computation is independent of the block size. + Each multiplication with the table includes a left shift of 6 bits. + Hence, we have 2* 6 bits = 12 bits left shift. + V-pass has fixed 7 bit right shift. 
+ H-pass has 20-BitDepth bit right shift; + + Effective shift 's' means: residual value 1 gives DC-coeff (1< "); + */ + + + // find last non-zero coefficient to reduce computations carried out in DCT + + int lastCol = nT-1; + for (;lastCol>=0;lastCol--) { + if (coeffs[c+lastCol*nT]) { break; } + } + + for (int i=0;i>7); + + logtrace(LogTransform,"*%d ",g[c+i*nT]); + } + logtrace(LogTransform,"*\n"); + } + + /* + printf("--- temp\n"); + for (int r=0;r "); + */ + + + // find last non-zero coefficient to reduce computations carried out in DCT + + int lastCol = nT-1; + for (;lastCol>=0;lastCol--) { + if (g[y*nT+lastCol]) { break; } + } + + + for (int i=0;i>postShift); + int out = (sum+rnd2)>>postShift; + + //fprintf(stderr,"%d*%d+%d = %d\n",y,stride,i,y*stride+i); + //fprintf(stderr,"[%p]=%d\n",&dst[y*stride+i], Clip1_8bit(dst[y*stride+i])); + dst[y*stride+i] = Clip_BitDepth(dst[y*stride+i] + out, bit_depth); + + logtrace(LogTransform,"*%d ",out); + } + logtrace(LogTransform,"*\n"); + } +} + + + +void transform_idct_fallback(int32_t *dst, int nT, const int16_t *coeffs, int bdShift, int max_coeff_bits) +{ + /* + The effective shift is + 7 bits right for bit-depth 8, + 6 bits right for bit-depth 9, + 5 bits right for bit-depth 10. + + One transformation with raw transform filter values increases range be 2048 (=32*64). + This equals 11 bits. + + Computation is independent of the block size. + Each multiplication with the table includes a left shift of 6 bits. + Hence, we have 2* 6 bits = 12 bits left shift. + V-pass has fixed 7 bit right shift. 
+ H-pass has 20-BitDepth bit right shift; + + Effective shift 's' means: residual value 1 gives DC-coeff (1< "); + */ + + + // find last non-zero coefficient to reduce computations carried out in DCT + + int lastCol = nT-1; + for (;lastCol>=0;lastCol--) { + if (coeffs[c+lastCol*nT]) { break; } + } + + for (int i=0;i>7); + + logtrace(LogTransform,"*%d ",g[c+i*nT]); + } + logtrace(LogTransform,"*\n"); + } + + /* + printf("--- temp\n"); + for (int r=0;r "); + */ + + + // find last non-zero coefficient to reduce computations carried out in DCT + + int lastCol = nT-1; + for (;lastCol>=0;lastCol--) { + if (g[y*nT+lastCol]) { break; } + } + + + for (int i=0;i>bdShift; + + logtrace(LogTransform,"*%d ",sum); + } + logtrace(LogTransform,"*\n"); + } +} + + +void transform_idct_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits) +{ + transform_idct_fallback(dst,4,coeffs,bdShift,max_coeff_bits); +} + +void transform_idct_8x8_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits) +{ + transform_idct_fallback(dst,8,coeffs,bdShift,max_coeff_bits); +} + +void transform_idct_16x16_fallback(int32_t *dst, const int16_t *coeffs, + int bdShift, int max_coeff_bits) +{ + transform_idct_fallback(dst,16,coeffs,bdShift,max_coeff_bits); +} + +void transform_idct_32x32_fallback(int32_t *dst, const int16_t *coeffs, + int bdShift, int max_coeff_bits) +{ + transform_idct_fallback(dst,32,coeffs,bdShift,max_coeff_bits); +} + + + + +void transform_4x4_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) +{ + transform_idct_add(dst,stride, 4, coeffs, 8); +} + +void transform_8x8_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) +{ + transform_idct_add(dst,stride, 8, coeffs, 8); +} + +void transform_16x16_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) +{ + transform_idct_add(dst,stride, 16, coeffs, 8); +} + +void transform_32x32_add_8_fallback(uint8_t *dst, const int16_t *coeffs, 
ptrdiff_t stride) +{ + transform_idct_add(dst,stride, 32, coeffs, 8); +} + + +void transform_4x4_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) +{ + transform_idct_add(dst,stride, 4, coeffs, bit_depth); +} + +void transform_8x8_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) +{ + transform_idct_add(dst,stride, 8, coeffs, bit_depth); +} + +void transform_16x16_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) +{ + transform_idct_add(dst,stride, 16, coeffs, bit_depth); +} + +void transform_32x32_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) +{ + transform_idct_add(dst,stride, 32, coeffs, bit_depth); +} + + +static void transform_fdct_8(int16_t* coeffs, int nT, + const int16_t *input, ptrdiff_t stride) +{ + /* + Each sum over a basis vector sums nT elements, which is compensated by + shifting right by Log2(nT), effectively dividing by 2^Log2(nT) = nT. + Do this in each of the H/V passes. + + Each multiplication with the table includes a left shift of 6 bits. + Hence, we have in total 2* 6 bits = 12 bits left shift because of the + multiplications. + + We carry out shifts after each pass: + First (V) pass has BitDepth-9 bit right shift, + Second (H) pass has fixed 6 bit right shift. + + For bit-depth 8, the total shift is 7 bits left. + For bit-depth 9, the total shift is 6 bits left. + For bit-depth 10, the total shift is 5 bits left. + + I.e.: a constant residual value 1 gives DC-coeff (1< 4 bits and we are down to 16 bits again. + After the second pass, we need 16+5+6=27 bits for the intermediate sum + (16 bit input, 5 bit because we sum 2^5 elements, 6 bit because of coefficient multiplication). + The second pass shift is Log2(32)+6 = 11 and we are down again to 16 bits. 
+ + For larger input bit-depths, the intermediate result after the first pass + will be wider accordingly, but the widths after the shifts are the same. + */ + + int BitDepth = 8; + + // / compensate everything | / effective word length | + int shift1 = Log2(nT) + 6 + BitDepth - 15; + int shift2 = Log2(nT) + 6; + + int rnd1 = 1<<(shift1-1); + int rnd2 = 1<<(shift2-1); + int fact = (1<<(5-Log2(nT))); + + int16_t g[32*32]; // actually, only [nT*nT] used + + for (int c=0;c>shift1; // clipping to -32768;32767 unnecessary + } + } + + + for (int y=0;y>shift2; + + coeffs[y*nT+i] = out; + } + } +} + + +void fdct_4x4_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) +{ + transform_fdct_8(coeffs, 4, input,stride); +} + +void fdct_8x8_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) +{ + transform_fdct_8(coeffs, 8, input,stride); +} + +void fdct_16x16_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) +{ + transform_fdct_8(coeffs, 16, input,stride); +} + +void fdct_32x32_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) +{ + transform_fdct_8(coeffs, 32, input,stride); +} + + + + +void hadamard_transform_8(int16_t *coeffs, int n, const int16_t *input, ptrdiff_t stride) +{ + int16_t tmp[32*32]; + + // row transforms + + //printMatrix("input",input,n); + + int16_t am[32],bm[32]; + int16_t *a = am, *b = bm; + for (int row=0;row>1);i++) { + a[ i] = input[i+rs] + input[i+(n>>1)+rs]; + a[(n>>1)+i] = input[i+rs] - input[i+(n>>1)+rs]; + } + + int iOuter=(n>>1); + int nInner=(n>>2); + + while (nInner>=2) { + std::swap(a,b); + + for (int k=0;k>=1; + nInner>>=1; + } + + for (int k=0;k>1);i++) { + a[ i] = tmp[i*n+col] + tmp[(i+(n>>1))*n+col]; + a[(n>>1)+i] = tmp[i*n+col] - tmp[(i+(n>>1))*n+col]; + } + + int iOuter=(n>>1); + int nInner=(n>>2); + + while (nInner>=2) { + std::swap(a,b); + + for (int k=0;k>=1; + nInner>>=1; + } + + for (int k=0;k + * + * This file is part of libde265. 
+ * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "fallback-motion.h" +#include "util.h" + +#if defined(_MSC_VER) || defined(__MINGW32__) +# include +#elif defined(HAVE_ALLOCA_H) +# include +#endif + +#include + + +void put_unweighted_pred_8_fallback(uint8_t *dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height) +{ + int offset8bit = 32; + int shift8bit = 6; + + assert((width&1)==0); + + for (int y=0;y>shift8bit); + out[1] = Clip1_8bit((in[1] + offset8bit)>>shift8bit); + out+=2; in+=2; + } + } +} + + +void put_weighted_pred_8_fallback(uint8_t *dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, + int w,int o,int log2WD) +{ + assert(log2WD>=1); // TODO + + const int rnd = (1<<(log2WD-1)); + + for (int y=0;y>log2WD) + o); + out++; in++; + } + } +} + +void put_weighted_bipred_8_fallback(uint8_t *dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD) +{ + assert(log2WD>=1); // TODO + + const int rnd = ((o1+o2+1) << log2WD); + + for (int y=0;y>(log2WD+1)); + out++; in1++; in2++; + } + } +} + + +void put_weighted_pred_avg_8_fallback(uint8_t *dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, + ptrdiff_t srcstride, int width, + int height) +{ + int offset8bit = 64; + int 
shift8bit = 7; + + assert((width&1)==0); + + // I had a special case for 8-pixel parallel, unrolled code, + // but I did not see any speedup. + +#if 0 + for (int y=0;y>shift8bit); + out++; in1++; in2++; + } + } +#endif + +#if 0 + if ((width&7)==0) { + for (int y=0;y>shift8bit); + out[1] = Clip1_8bit((in1[1] + in2[1] + offset8bit)>>shift8bit); + out[2] = Clip1_8bit((in1[2] + in2[2] + offset8bit)>>shift8bit); + out[3] = Clip1_8bit((in1[3] + in2[3] + offset8bit)>>shift8bit); + out[4] = Clip1_8bit((in1[4] + in2[4] + offset8bit)>>shift8bit); + out[5] = Clip1_8bit((in1[5] + in2[5] + offset8bit)>>shift8bit); + out[6] = Clip1_8bit((in1[6] + in2[6] + offset8bit)>>shift8bit); + out[7] = Clip1_8bit((in1[7] + in2[7] + offset8bit)>>shift8bit); + out+=8; in1+=8; in2+=8; + } + } + } + else +#endif + { + for (int y=0;y>shift8bit); + out[1] = Clip1_8bit((in1[1] + in2[1] + offset8bit)>>shift8bit); + out+=2; in1+=2; in2+=2; + } + } + } +} + + + + + +void put_unweighted_pred_16_fallback(uint16_t *dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, int bit_depth) +{ + int shift1 = 14-bit_depth; + int offset1 = 0; + if (shift1>0) { offset1 = 1<<(shift1-1); } + + assert((width&1)==0); + + for (int y=0;y>shift1, bit_depth); + out[1] = Clip_BitDepth((in[1] + offset1)>>shift1, bit_depth); + out+=2; in+=2; + } + } +} + +#include + +void put_weighted_pred_16_fallback(uint16_t *dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, + int w,int o,int log2WD, int bit_depth) +{ + assert(log2WD>=1); // TODO + + const int rnd = (1<<(log2WD-1)); + + for (int y=0;y>log2WD) + o, bit_depth); + out++; in++; + } + } +} + +void put_weighted_bipred_16_fallback(uint16_t *dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD, int bit_depth) +{ + assert(log2WD>=1); // TODO + + const int rnd = ((o1+o2+1) << log2WD); + + for 
(int y=0;y>(log2WD+1), bit_depth); + out++; in1++; in2++; + } + } +} + + +void put_weighted_pred_avg_16_fallback(uint16_t *dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, + ptrdiff_t srcstride, int width, + int height, int bit_depth) +{ + int shift2 = 15-bit_depth; + int offset2 = 1<<(shift2-1); + + assert((width&1)==0); + + for (int y=0;y>shift2, bit_depth); + out[1] = Clip_BitDepth((in1[1] + in2[1] + offset2)>>shift2, bit_depth); + out+=2; in1+=2; in2+=2; + } + } +} + + + + + +void put_epel_8_fallback(int16_t *out, ptrdiff_t out_stride, + const uint8_t *src, ptrdiff_t src_stride, + int width, int height, + int mx, int my, int16_t* mcbuffer) +{ + int shift3 = 6; + + for (int y=0;y +void put_epel_hv_fallback(int16_t *dst, ptrdiff_t dst_stride, + const pixel_t *src, ptrdiff_t src_stride, + int nPbWC, int nPbHC, + int xFracC, int yFracC, int16_t* mcbuffer, int bit_depth) +{ + const int shift1 = bit_depth-8; + const int shift2 = 6; + //const int shift3 = 6; + + int extra_left = 1; + int extra_top = 1; + // int extra_right = 2; + int extra_bottom= 2; + + + int nPbH_extra = extra_top + nPbHC + extra_bottom; + + int16_t* tmp2buf = (int16_t*)alloca( nPbWC * nPbH_extra * sizeof(int16_t) ); + + /* + int nPbW_extra = extra_left + nPbWC + extra_right; + + + printf("x,y FracC: %d/%d\n",xFracC,yFracC); + + printf("---IN---\n"); + + for (int y=-extra_top;y>shift1; break; + case 2: v = (-4*p[0]+54*p[1]+16*p[2]-2*p[3])>>shift1; break; + case 3: v = (-6*p[0]+46*p[1]+28*p[2]-4*p[3])>>shift1; break; + case 4: v = (-4*p[0]+36*p[1]+36*p[2]-4*p[3])>>shift1; break; + case 5: v = (-4*p[0]+28*p[1]+46*p[2]-6*p[3])>>shift1; break; + case 6: v = (-2*p[0]+16*p[1]+54*p[2]-4*p[3])>>shift1; break; + default: + case 7: v = (-2*p[0]+10*p[1]+58*p[2]-2*p[3])>>shift1; break; + } + + //printf("%d %d %d %d -> %d\n",p[0],p[1],p[2],p[3],v); + + tmp2buf[y+extra_top + x*nPbH_extra] = v; + p++; + + //printf("%05d ",tmp2buf[y+extra_top + x*nPbH_extra]); + } + //printf("\n"); + } + + // 
V-filters + + int vshift = (xFracC==0 ? shift1 : shift2); + + for (int x=0;x>vshift; break; + case 2: v = (-4*p[0]+54*p[1]+16*p[2]-2*p[3])>>vshift; break; + case 3: v = (-6*p[0]+46*p[1]+28*p[2]-4*p[3])>>vshift; break; + case 4: v = (-4*p[0]+36*p[1]+36*p[2]-4*p[3])>>vshift; break; + case 5: v = (-4*p[0]+28*p[1]+46*p[2]-6*p[3])>>vshift; break; + case 6: v = (-2*p[0]+16*p[1]+54*p[2]-4*p[3])>>vshift; break; + default: + case 7: v = (-2*p[0]+10*p[1]+58*p[2]-2*p[3])>>vshift; break; + } + + dst[x + y*dst_stride] = v; + p++; + } + + } + + /* + printf("---V---\n"); + for (int y=0;y(int16_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int nPbWC, int nPbHC, + int xFracC, int yFracC, int16_t* mcbuffer, int bit_depth); +template +void put_epel_hv_fallback(int16_t *dst, ptrdiff_t dst_stride, + const uint16_t *src, ptrdiff_t src_stride, + int nPbWC, int nPbHC, + int xFracC, int yFracC, int16_t* mcbuffer, int bit_depth); + + + +void put_qpel_0_0_fallback(int16_t *out, ptrdiff_t out_stride, + const uint8_t *src, ptrdiff_t srcstride, + int nPbW, int nPbH, int16_t* mcbuffer) +{ + //const int shift1 = 0; // sps->BitDepth_Y-8; + const int shift2 = 6; + + // straight copy + + for (int y=0;y +void put_qpel_fallback(int16_t *out, ptrdiff_t out_stride, + const pixel_t *src, ptrdiff_t srcstride, + int nPbW, int nPbH, int16_t* mcbuffer, + int xFracL, int yFracL, int bit_depth) +{ + int extra_left = extra_before[xFracL]; + //int extra_right = extra_after [xFracL]; + int extra_top = extra_before[yFracL]; + int extra_bottom = extra_after [yFracL]; + + //int nPbW_extra = extra_left + nPbW + extra_right; + int nPbH_extra = extra_top + nPbH + extra_bottom; + + const int shift1 = bit_depth-8; + const int shift2 = 6; + + + // H-filters + + switch (xFracL) { + case 0: + for (int y=-extra_top;y>shift1; + o += nPbH_extra; + p++; + } + } + break; + case 2: + for (int y=-extra_top;y>shift1; + o += nPbH_extra; + p++; + } + } + break; + case 3: + for (int y=-extra_top;y>shift1; 
+ o += nPbH_extra; + p++; + } + } + break; + } + + + logtrace(LogMotion,"---H---\n"); + + for (int y=-extra_top;y>vshift; + o+=out_stride; + p++; + } + } + break; + case 2: + for (int x=0;x>vshift; + o+=out_stride; + p++; + } + } + break; + case 3: + for (int x=0;x>vshift; + o+=out_stride; + p++; + } + } + break; + } + + + logtrace(LogMotion,"---V---\n"); + for (int y=0;y + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "fallback.h" +#include "fallback-motion.h" +#include "fallback-dct.h" + + +void init_acceleration_functions_fallback(struct acceleration_functions* accel) +{ + accel->put_weighted_pred_avg_8 = put_weighted_pred_avg_8_fallback; + accel->put_unweighted_pred_8 = put_unweighted_pred_8_fallback; + accel->put_weighted_pred_8 = put_weighted_pred_8_fallback; + accel->put_weighted_bipred_8 = put_weighted_bipred_8_fallback; + + accel->put_weighted_pred_avg_16 = put_weighted_pred_avg_16_fallback; + accel->put_unweighted_pred_16 = put_unweighted_pred_16_fallback; + accel->put_weighted_pred_16 = put_weighted_pred_16_fallback; + accel->put_weighted_bipred_16 = put_weighted_bipred_16_fallback; + + + accel->put_hevc_epel_8 = put_epel_8_fallback; + accel->put_hevc_epel_h_8 = put_epel_hv_fallback; + accel->put_hevc_epel_v_8 = put_epel_hv_fallback; + accel->put_hevc_epel_hv_8 = put_epel_hv_fallback; + + accel->put_hevc_qpel_8[0][0] = put_qpel_0_0_fallback; + accel->put_hevc_qpel_8[0][1] = put_qpel_0_1_fallback; + accel->put_hevc_qpel_8[0][2] = put_qpel_0_2_fallback; + accel->put_hevc_qpel_8[0][3] = put_qpel_0_3_fallback; + accel->put_hevc_qpel_8[1][0] = put_qpel_1_0_fallback; + accel->put_hevc_qpel_8[1][1] = put_qpel_1_1_fallback; + accel->put_hevc_qpel_8[1][2] = put_qpel_1_2_fallback; + accel->put_hevc_qpel_8[1][3] = put_qpel_1_3_fallback; + accel->put_hevc_qpel_8[2][0] = put_qpel_2_0_fallback; + accel->put_hevc_qpel_8[2][1] = put_qpel_2_1_fallback; + accel->put_hevc_qpel_8[2][2] = put_qpel_2_2_fallback; + accel->put_hevc_qpel_8[2][3] = put_qpel_2_3_fallback; + accel->put_hevc_qpel_8[3][0] = put_qpel_3_0_fallback; + accel->put_hevc_qpel_8[3][1] = put_qpel_3_1_fallback; + accel->put_hevc_qpel_8[3][2] = put_qpel_3_2_fallback; + accel->put_hevc_qpel_8[3][3] = put_qpel_3_3_fallback; + + accel->put_hevc_epel_16 = put_epel_16_fallback; + accel->put_hevc_epel_h_16 = put_epel_hv_fallback; + accel->put_hevc_epel_v_16 = put_epel_hv_fallback; + accel->put_hevc_epel_hv_16 = 
put_epel_hv_fallback; + + accel->put_hevc_qpel_16[0][0] = put_qpel_0_0_fallback_16; + accel->put_hevc_qpel_16[0][1] = put_qpel_0_1_fallback_16; + accel->put_hevc_qpel_16[0][2] = put_qpel_0_2_fallback_16; + accel->put_hevc_qpel_16[0][3] = put_qpel_0_3_fallback_16; + accel->put_hevc_qpel_16[1][0] = put_qpel_1_0_fallback_16; + accel->put_hevc_qpel_16[1][1] = put_qpel_1_1_fallback_16; + accel->put_hevc_qpel_16[1][2] = put_qpel_1_2_fallback_16; + accel->put_hevc_qpel_16[1][3] = put_qpel_1_3_fallback_16; + accel->put_hevc_qpel_16[2][0] = put_qpel_2_0_fallback_16; + accel->put_hevc_qpel_16[2][1] = put_qpel_2_1_fallback_16; + accel->put_hevc_qpel_16[2][2] = put_qpel_2_2_fallback_16; + accel->put_hevc_qpel_16[2][3] = put_qpel_2_3_fallback_16; + accel->put_hevc_qpel_16[3][0] = put_qpel_3_0_fallback_16; + accel->put_hevc_qpel_16[3][1] = put_qpel_3_1_fallback_16; + accel->put_hevc_qpel_16[3][2] = put_qpel_3_2_fallback_16; + accel->put_hevc_qpel_16[3][3] = put_qpel_3_3_fallback_16; + + + + accel->transform_skip_8 = transform_skip_8_fallback; + accel->transform_skip_rdpcm_h_8 = transform_skip_rdpcm_h_8_fallback; + accel->transform_skip_rdpcm_v_8 = transform_skip_rdpcm_v_8_fallback; + accel->transform_bypass = transform_bypass_fallback; + accel->transform_bypass_rdpcm_h = transform_bypass_rdpcm_h_fallback; + accel->transform_bypass_rdpcm_v = transform_bypass_rdpcm_v_fallback; + accel->transform_4x4_dst_add_8 = transform_4x4_luma_add_8_fallback; + accel->transform_add_8[0] = transform_4x4_add_8_fallback; + accel->transform_add_8[1] = transform_8x8_add_8_fallback; + accel->transform_add_8[2] = transform_16x16_add_8_fallback; + accel->transform_add_8[3] = transform_32x32_add_8_fallback; + + accel->transform_skip_16 = transform_skip_16_fallback; + accel->transform_4x4_dst_add_16 = transform_4x4_luma_add_16_fallback; + accel->transform_add_16[0] = transform_4x4_add_16_fallback; + accel->transform_add_16[1] = transform_8x8_add_16_fallback; + accel->transform_add_16[2] = 
transform_16x16_add_16_fallback; + accel->transform_add_16[3] = transform_32x32_add_16_fallback; + + accel->rotate_coefficients = rotate_coefficients_fallback; + accel->add_residual_8 = add_residual_fallback; + accel->add_residual_16 = add_residual_fallback; + accel->rdpcm_h = rdpcm_h_fallback; + accel->rdpcm_v = rdpcm_v_fallback; + accel->transform_skip_residual = transform_skip_residual_fallback; + + accel->transform_idst_4x4 = transform_idst_4x4_fallback; + accel->transform_idct_4x4 = transform_idct_4x4_fallback; + accel->transform_idct_8x8 = transform_idct_8x8_fallback; + accel->transform_idct_16x16 = transform_idct_16x16_fallback; + accel->transform_idct_32x32 = transform_idct_32x32_fallback; + + accel->fwd_transform_4x4_dst_8 = fdst_4x4_8_fallback; + accel->fwd_transform_8[0] = fdct_4x4_8_fallback; + accel->fwd_transform_8[1] = fdct_8x8_8_fallback; + accel->fwd_transform_8[2] = fdct_16x16_8_fallback; + accel->fwd_transform_8[3] = fdct_32x32_8_fallback; + + accel->hadamard_transform_8[0] = hadamard_4x4_8_fallback; + accel->hadamard_transform_8[1] = hadamard_8x8_8_fallback; + accel->hadamard_transform_8[2] = hadamard_16x16_8_fallback; + accel->hadamard_transform_8[3] = hadamard_32x32_8_fallback; +} diff --git a/image-io.cc b/image-io.cc new file mode 100644 index 0000000..60f4e6c --- /dev/null +++ b/image-io.cc @@ -0,0 +1,220 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * Authors: struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "libde265/image-io.h" +#include + + + +ImageSource::ImageSource() +{ +} + + +ImageSource_YUV::ImageSource_YUV() + : mFH(NULL) +{ +} + + +ImageSource_YUV::~ImageSource_YUV() +{ + if (mFH) { + fclose(mFH); + } +} + + +bool ImageSource_YUV::set_input_file(const char* filename, int w,int h) +{ + assert(mFH==NULL); + + mFH = fopen(filename,"rb"); + if (mFH==NULL) { + return false; + } + + width =w; + height=h; + mReachedEndOfFile = false; + + return true; +} + + +de265_image* ImageSource_YUV::read_next_image() +{ + if (mReachedEndOfFile) return NULL; + + de265_image* img = new de265_image; + img->alloc_image(width,height,de265_chroma_420, NULL, false, + NULL, /*NULL,*/ 0, NULL, false); + assert(img); // TODO: error handling + + // --- load image --- + + uint8_t* p; + int stride; + + p = img->get_image_plane(0); stride = img->get_image_stride(0); + for (int y=0;yget_image_plane(1); stride = img->get_image_stride(1); + for (int y=0;yget_image_plane(2); stride = img->get_image_stride(2); + for (int y=0;yget_width(); + int height= img->get_height(); + + p = img->get_image_plane(0); stride = img->get_image_stride(0); + for (int y=0;yget_image_plane(1); stride = img->get_image_stride(1); + for (int y=0;yget_image_plane(2); stride = img->get_image_stride(2); + for (int y=0;y + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "image.h" +#include "decctx.h" + +#include + +#include +#include +#include + +#include + + +#ifdef HAVE_MALLOC_H +#include +#endif + +#ifdef HAVE_SSE4_1 +// SSE code processes 128bit per iteration and thus might read more data +// than is later actually used. +#define MEMORY_PADDING 16 +#else +#define MEMORY_PADDING 0 +#endif + +#define STANDARD_ALIGNMENT 16 + +#ifdef HAVE___MINGW_ALIGNED_MALLOC +#define ALLOC_ALIGNED(alignment, size) __mingw_aligned_malloc((size), (alignment)) +#define FREE_ALIGNED(mem) __mingw_aligned_free((mem)) +#elif _WIN32 +#define ALLOC_ALIGNED(alignment, size) _aligned_malloc((size), (alignment)) +#define FREE_ALIGNED(mem) _aligned_free((mem)) +#elif defined(HAVE_POSIX_MEMALIGN) +static inline void *ALLOC_ALIGNED(size_t alignment, size_t size) { + void *mem = NULL; + if (posix_memalign(&mem, alignment, size) != 0) { + return NULL; + } + return mem; +}; +#define FREE_ALIGNED(mem) free((mem)) +#else +#define ALLOC_ALIGNED(alignment, size) memalign((alignment), (size)) +#define FREE_ALIGNED(mem) free((mem)) +#endif + +#define ALLOC_ALIGNED_16(size) ALLOC_ALIGNED(16, size) + +static const int alignment = 16; + +LIBDE265_API void* de265_alloc_image_plane(struct de265_image* img, int cIdx, + void* inputdata, int inputstride, void *userdata) +{ + int alignment = STANDARD_ALIGNMENT; + int stride = (img->get_width(cIdx) + alignment-1) / alignment * alignment; + int height = img->get_height(cIdx); + + uint8_t* p = (uint8_t *)ALLOC_ALIGNED_16(stride * height + MEMORY_PADDING); + + if (p==NULL) { return NULL; } + + img->set_image_plane(cIdx, p, stride, userdata); + + // copy input data if provided + + if (inputdata != NULL) { + if (inputstride == stride) { + memcpy(p, inputdata, stride*height); + } + else { + for (int y=0;yget_image_plane(cIdx); + 
assert(p); + FREE_ALIGNED(p); +} + + +static int de265_image_get_buffer(de265_decoder_context* ctx, + de265_image_spec* spec, de265_image* img, void* userdata) +{ + const int rawChromaWidth = spec->width / img->SubWidthC; + const int rawChromaHeight = spec->height / img->SubHeightC; + + int luma_stride = (spec->width + spec->alignment-1) / spec->alignment * spec->alignment; + int chroma_stride = (rawChromaWidth + spec->alignment-1) / spec->alignment * spec->alignment; + + assert(img->BitDepth_Y >= 8 && img->BitDepth_Y <= 16); + assert(img->BitDepth_C >= 8 && img->BitDepth_C <= 16); + + int luma_bpl = luma_stride * ((img->BitDepth_Y+7)/8); + int chroma_bpl = chroma_stride * ((img->BitDepth_C+7)/8); + + int luma_height = spec->height; + int chroma_height = rawChromaHeight; + + bool alloc_failed = false; + + uint8_t* p[3] = { 0,0,0 }; + p[0] = (uint8_t *)ALLOC_ALIGNED_16(luma_height * luma_bpl + MEMORY_PADDING); + if (p[0]==NULL) { alloc_failed=true; } + + if (img->get_chroma_format() != de265_chroma_mono) { + p[1] = (uint8_t *)ALLOC_ALIGNED_16(chroma_height * chroma_bpl + MEMORY_PADDING); + p[2] = (uint8_t *)ALLOC_ALIGNED_16(chroma_height * chroma_bpl + MEMORY_PADDING); + + if (p[1]==NULL || p[2]==NULL) { alloc_failed=true; } + } + else { + p[1] = NULL; + p[2] = NULL; + chroma_stride = 0; + } + + if (alloc_failed) { + for (int i=0;i<3;i++) + if (p[i]) { + FREE_ALIGNED(p[i]); + } + + return 0; + } + + img->set_image_plane(0, p[0], luma_stride, NULL); + img->set_image_plane(1, p[1], chroma_stride, NULL); + img->set_image_plane(2, p[2], chroma_stride, NULL); + + return 1; +} + +static void de265_image_release_buffer(de265_decoder_context* ctx, + de265_image* img, void* userdata) +{ + for (int i=0;i<3;i++) { + uint8_t* p = (uint8_t*)img->get_image_plane(i); + if (p) { + FREE_ALIGNED(p); + } + } +} + + +de265_image_allocation de265_image::default_image_allocation = { + de265_image_get_buffer, + de265_image_release_buffer +}; + + +void de265_image::set_image_plane(int 
cIdx, uint8_t* mem, int stride, void *userdata) +{ + pixels[cIdx] = mem; + plane_user_data[cIdx] = userdata; + + if (cIdx==0) { this->stride = stride; } + else { this->chroma_stride = stride; } +} + + +uint32_t de265_image::s_next_image_ID = 0; + +de265_image::de265_image() +{ + ID = -1; + removed_at_picture_id = 0; // picture not used, so we can assume it has been removed + + decctx = NULL; + //encctx = NULL; + + //encoder_image_release_func = NULL; + + //alloc_functions.get_buffer = NULL; + //alloc_functions.release_buffer = NULL; + + for (int c=0;c<3;c++) { + pixels[c] = NULL; + pixels_confwin[c] = NULL; + plane_user_data[c] = NULL; + } + + width=height=0; + + pts = 0; + user_data = NULL; + + ctb_progress = NULL; + + integrity = INTEGRITY_NOT_DECODED; + + picture_order_cnt_lsb = -1; // undefined + PicOrderCntVal = -1; // undefined + PicState = UnusedForReference; + PicOutputFlag = false; + + nThreadsQueued = 0; + nThreadsRunning = 0; + nThreadsBlocked = 0; + nThreadsFinished = 0; + nThreadsTotal = 0; + + de265_mutex_init(&mutex); + de265_cond_init(&finished_cond); +} + + +de265_error de265_image::alloc_image(int w,int h, enum de265_chroma c, + std::shared_ptr sps, bool allocMetadata, + decoder_context* dctx, + //encoder_context* ectx, + de265_PTS pts, void* user_data, + bool useCustomAllocFunc) +{ + //if (allocMetadata) { assert(sps); } + if (allocMetadata) { assert(sps); } + + if (sps) { this->sps = sps; } + + release(); /* TODO: review code for efficient allocation when arrays are already + allocated to the requested size. Without the release, the old image-data + will not be freed. 
*/ + + ID = s_next_image_ID++; + removed_at_picture_id = std::numeric_limits::max(); + + decctx = dctx; + //encctx = ectx; + + // --- allocate image buffer --- + + chroma_format= c; + + width = w; + height = h; + chroma_width = w; + chroma_height= h; + + this->user_data = user_data; + this->pts = pts; + + de265_image_spec spec; + + int WinUnitX, WinUnitY; + + switch (chroma_format) { + case de265_chroma_mono: WinUnitX=1; WinUnitY=1; break; + case de265_chroma_420: WinUnitX=2; WinUnitY=2; break; + case de265_chroma_422: WinUnitX=2; WinUnitY=1; break; + case de265_chroma_444: WinUnitX=1; WinUnitY=1; break; + default: + assert(0); + } + + switch (chroma_format) { + case de265_chroma_420: + spec.format = de265_image_format_YUV420P8; + chroma_width = (chroma_width +1)/2; + chroma_height = (chroma_height+1)/2; + SubWidthC = 2; + SubHeightC = 2; + break; + + case de265_chroma_422: + spec.format = de265_image_format_YUV422P8; + chroma_width = (chroma_width+1)/2; + SubWidthC = 2; + SubHeightC = 1; + break; + + case de265_chroma_444: + spec.format = de265_image_format_YUV444P8; + SubWidthC = 1; + SubHeightC = 1; + break; + + case de265_chroma_mono: + spec.format = de265_image_format_mono8; + chroma_width = 0; + chroma_height= 0; + SubWidthC = 1; + SubHeightC = 1; + break; + + default: + assert(false); + break; + } + + if (chroma_format != de265_chroma_mono && sps) { + assert(sps->SubWidthC == SubWidthC); + assert(sps->SubHeightC == SubHeightC); + } + + spec.width = w; + spec.height = h; + spec.alignment = STANDARD_ALIGNMENT; + + + // conformance window cropping + + int left = sps ? sps->conf_win_left_offset : 0; + int right = sps ? sps->conf_win_right_offset : 0; + int top = sps ? sps->conf_win_top_offset : 0; + int bottom = sps ? 
sps->conf_win_bottom_offset : 0; + + width_confwin = width - (left+right)*WinUnitX; + height_confwin= height- (top+bottom)*WinUnitY; + chroma_width_confwin = chroma_width -left-right; + chroma_height_confwin= chroma_height-top-bottom; + + spec.crop_left = left *WinUnitX; + spec.crop_right = right*WinUnitX; + spec.crop_top = top *WinUnitY; + spec.crop_bottom= bottom*WinUnitY; + + spec.visible_width = width_confwin; + spec.visible_height= height_confwin; + + + BitDepth_Y = (sps==NULL) ? 8 : sps->BitDepth_Y; + BitDepth_C = (sps==NULL) ? 8 : sps->BitDepth_C; + + bpp_shift[0] = (BitDepth_Y <= 8) ? 0 : 1; + bpp_shift[1] = (BitDepth_C <= 8) ? 0 : 1; + bpp_shift[2] = bpp_shift[1]; + + + // allocate memory and set conformance window pointers + + void* alloc_userdata = NULL; + if (decctx) alloc_userdata = decctx->param_image_allocation_userdata; + // if (encctx) alloc_userdata = encctx->param_image_allocation_userdata; // actually not needed + + /* + if (encctx && useCustomAllocFunc) { + encoder_image_release_func = encctx->release_func; + + // if we do not provide a release function, use our own + + if (encoder_image_release_func == NULL) { + image_allocation_functions = de265_image::default_image_allocation; + } + else { + image_allocation_functions.get_buffer = NULL; + image_allocation_functions.release_buffer = NULL; + } + } + else*/ if (decctx && useCustomAllocFunc) { + image_allocation_functions = decctx->param_image_allocation_functions; + } + else { + image_allocation_functions = de265_image::default_image_allocation; + } + + bool mem_alloc_success = true; + + if (image_allocation_functions.get_buffer != NULL) { + mem_alloc_success = image_allocation_functions.get_buffer(decctx, &spec, this, + alloc_userdata); + + pixels_confwin[0] = pixels[0] + left*WinUnitX + top*WinUnitY*stride; + + if (chroma_format != de265_chroma_mono) { + pixels_confwin[1] = pixels[1] + left + top*chroma_stride; + pixels_confwin[2] = pixels[2] + left + top*chroma_stride; + } + else { + 
pixels_confwin[1] = NULL; + pixels_confwin[2] = NULL; + } + + // check for memory shortage + + if (!mem_alloc_success) + { + return DE265_ERROR_OUT_OF_MEMORY; + } + } + + //alloc_functions = *allocfunc; + //alloc_userdata = userdata; + + // --- allocate decoding info arrays --- + + if (allocMetadata) { + // intra pred mode + + mem_alloc_success &= intraPredMode.alloc(sps->PicWidthInMinPUs, sps->PicHeightInMinPUs, + sps->Log2MinPUSize); + + mem_alloc_success &= intraPredModeC.alloc(sps->PicWidthInMinPUs, sps->PicHeightInMinPUs, + sps->Log2MinPUSize); + + // cb info + + mem_alloc_success &= cb_info.alloc(sps->PicWidthInMinCbsY, sps->PicHeightInMinCbsY, + sps->Log2MinCbSizeY); + + // pb info + + int puWidth = sps->PicWidthInMinCbsY << (sps->Log2MinCbSizeY -2); + int puHeight = sps->PicHeightInMinCbsY << (sps->Log2MinCbSizeY -2); + + mem_alloc_success &= pb_info.alloc(puWidth,puHeight, 2); + + + // tu info + + mem_alloc_success &= tu_info.alloc(sps->PicWidthInTbsY, sps->PicHeightInTbsY, + sps->Log2MinTrafoSize); + + // deblk info + + int deblk_w = (sps->pic_width_in_luma_samples +3)/4; + int deblk_h = (sps->pic_height_in_luma_samples+3)/4; + + mem_alloc_success &= deblk_info.alloc(deblk_w, deblk_h, 2); + + // CTB info + + if (ctb_info.data_size != sps->PicSizeInCtbsY) + { + delete[] ctb_progress; + + mem_alloc_success &= ctb_info.alloc(sps->PicWidthInCtbsY, sps->PicHeightInCtbsY, + sps->Log2CtbSizeY); + + ctb_progress = new de265_progress_lock[ ctb_info.data_size ]; + } + + + // check for memory shortage + + if (!mem_alloc_success) + { + return DE265_ERROR_OUT_OF_MEMORY; + } + } + + return DE265_OK; +} + + +de265_image::~de265_image() +{ + release(); + + // free progress locks + + if (ctb_progress) { + delete[] ctb_progress; + } + + de265_cond_destroy(&finished_cond); + de265_mutex_destroy(&mutex); +} + + +void de265_image::release() +{ + // free image memory + + if (pixels[0]) + { + /* + if (encoder_image_release_func != NULL) { + encoder_image_release_func(encctx, 
this, + encctx->param_image_allocation_userdata); + } + else*/ { + image_allocation_functions.release_buffer(decctx, this, + decctx ? + decctx->param_image_allocation_userdata : + NULL); + } + + for (int i=0;i<3;i++) + { + pixels[i] = NULL; + pixels_confwin[i] = NULL; + } + } + + // free slices + + for (int i=0;i=0) { + memset(pixels[0], y, stride * height); + } + + if (cb>=0) { + memset(pixels[1], cb, chroma_stride * chroma_height); + } + + if (cr>=0) { + memset(pixels[2], cr, chroma_stride * chroma_height); + } +} + + +de265_error de265_image::copy_image(const de265_image* src) +{ + /* TODO: actually, since we allocate the image only for internal purpose, we + do not have to call the external allocation routines for this. However, then + we have to track for each image how to release it again. + Another option would be to safe the copied data not in an de265_image at all. + */ + + de265_error err = alloc_image(src->width, src->height, src->chroma_format, src->sps, false, + src->decctx, /*src->encctx,*/ src->pts, src->user_data, false); + if (err != DE265_OK) { + return err; + } + + copy_lines_from(src, 0, src->height); + + return err; +} + + +// end = last line + 1 +void de265_image::copy_lines_from(const de265_image* src, int first, int end) +{ + if (end > src->height) end=src->height; + + assert(first % 2 == 0); + assert(end % 2 == 0); + + int luma_bpp = (sps->BitDepth_Y+7)/8; + int chroma_bpp = (sps->BitDepth_C+7)/8; + + if (src->stride == stride) { + memcpy(pixels[0] + first*stride * luma_bpp, + src->pixels[0] + first*src->stride * luma_bpp, + (end-first)*stride * luma_bpp); + } + else { + for (int yp=first;yppixels[0]+yp*src->stride * luma_bpp, + src->width * luma_bpp); + } + } + + int first_chroma = first / src->SubHeightC; + int end_chroma = end / src->SubHeightC; + + if (src->chroma_format != de265_chroma_mono) { + if (src->chroma_stride == chroma_stride) { + memcpy(pixels[1] + first_chroma*chroma_stride * chroma_bpp, + src->pixels[1] + 
first_chroma*chroma_stride * chroma_bpp, + (end_chroma-first_chroma) * chroma_stride * chroma_bpp); + memcpy(pixels[2] + first_chroma*chroma_stride * chroma_bpp, + src->pixels[2] + first_chroma*chroma_stride * chroma_bpp, + (end_chroma-first_chroma) * chroma_stride * chroma_bpp); + } + else { + for (int y=first_chroma;ypixels[1]+y*src->chroma_stride * chroma_bpp, + src->chroma_width * chroma_bpp); + memcpy(pixels[2]+y*chroma_stride * chroma_bpp, + src->pixels[2]+y*src->chroma_stride * chroma_bpp, + src->chroma_width * chroma_bpp); + } + } + } +} + + +void de265_image::exchange_pixel_data_with(de265_image& b) +{ + for (int i=0;i<3;i++) { + std::swap(pixels[i], b.pixels[i]); + std::swap(pixels_confwin[i], b.pixels_confwin[i]); + std::swap(plane_user_data[i], b.plane_user_data[i]); + } + + std::swap(stride, b.stride); + std::swap(chroma_stride, b.chroma_stride); + std::swap(image_allocation_functions, b.image_allocation_functions); +} + + +void de265_image::thread_start(int nThreads) +{ + de265_mutex_lock(&mutex); + + //printf("nThreads before: %d %d\n",nThreadsQueued, nThreadsTotal); + + nThreadsQueued += nThreads; + nThreadsTotal += nThreads; + + //printf("nThreads after: %d %d\n",nThreadsQueued, nThreadsTotal); + + de265_mutex_unlock(&mutex); +} + +void de265_image::thread_run(const thread_task* task) +{ + //printf("run thread %s\n", task->name().c_str()); + + de265_mutex_lock(&mutex); + nThreadsQueued--; + nThreadsRunning++; + de265_mutex_unlock(&mutex); +} + +void de265_image::thread_blocks() +{ + de265_mutex_lock(&mutex); + nThreadsRunning--; + nThreadsBlocked++; + de265_mutex_unlock(&mutex); +} + +void de265_image::thread_unblocks() +{ + de265_mutex_lock(&mutex); + nThreadsBlocked--; + nThreadsRunning++; + de265_mutex_unlock(&mutex); +} + +void de265_image::thread_finishes(const thread_task* task) +{ + //printf("finish thread %s\n", task->name().c_str()); + + de265_mutex_lock(&mutex); + + nThreadsRunning--; + nThreadsFinished++; + assert(nThreadsRunning >= 0); 
+ + if (nThreadsFinished==nThreadsTotal) { + de265_cond_broadcast(&finished_cond, &mutex); + } + + de265_mutex_unlock(&mutex); +} + +void de265_image::wait_for_progress(thread_task* task, int ctbx,int ctby, int progress) +{ + const int ctbW = sps->PicWidthInCtbsY; + + wait_for_progress(task, ctbx + ctbW*ctby, progress); +} + +void de265_image::wait_for_progress(thread_task* task, int ctbAddrRS, int progress) +{ + if (task==NULL) { return; } + + de265_progress_lock* progresslock = &ctb_progress[ctbAddrRS]; + if (progresslock->get_progress() < progress) { + thread_blocks(); + + assert(task!=NULL); + task->state = thread_task::Blocked; + + /* TODO: check whether we are the first blocked task in the list. + If we are, we have to conceal input errors. + Simplest concealment: do not block. + */ + + progresslock->wait_for_progress(progress); + task->state = thread_task::Running; + thread_unblocks(); + } +} + + +void de265_image::wait_for_completion() +{ + de265_mutex_lock(&mutex); + while (nThreadsFinished!=nThreadsTotal) { + de265_cond_wait(&finished_cond, &mutex); + } + de265_mutex_unlock(&mutex); +} + +bool de265_image::debug_is_completed() const +{ + return nThreadsFinished==nThreadsTotal; +} + + + +void de265_image::clear_metadata() +{ + // TODO: maybe we could avoid the memset by ensuring that all data is written to + // during decoding (especially log2CbSize), but it is unlikely to be faster than the memset. 
+ + cb_info.clear(); + //tu_info.clear(); // done on the fly + ctb_info.clear(); + deblk_info.clear(); + + // --- reset CTB progresses --- + + for (int i=0;i> log2PuSize; + int yPu = y >> log2PuSize; + int wPu = nPbW >> log2PuSize; + int hPu = nPbH >> log2PuSize; + + int stride = pb_info.width_in_units; + + for (int pby=0;pby=sps->pic_width_in_luma_samples || + yN>=sps->pic_height_in_luma_samples) return false; + + int minBlockAddrN = pps->MinTbAddrZS[ (xN>>sps->Log2MinTrafoSize) + + (yN>>sps->Log2MinTrafoSize) * sps->PicWidthInTbsY ]; + int minBlockAddrCurr = pps->MinTbAddrZS[ (xCurr>>sps->Log2MinTrafoSize) + + (yCurr>>sps->Log2MinTrafoSize) * sps->PicWidthInTbsY ]; + + if (minBlockAddrN > minBlockAddrCurr) return false; + + int xCurrCtb = xCurr >> sps->Log2CtbSizeY; + int yCurrCtb = yCurr >> sps->Log2CtbSizeY; + int xNCtb = xN >> sps->Log2CtbSizeY; + int yNCtb = yN >> sps->Log2CtbSizeY; + + if (get_SliceAddrRS(xCurrCtb,yCurrCtb) != + get_SliceAddrRS(xNCtb, yNCtb)) { + return false; + } + + if (pps->TileIdRS[xCurrCtb + yCurrCtb*sps->PicWidthInCtbsY] != + pps->TileIdRS[xNCtb + yNCtb *sps->PicWidthInCtbsY]) { + return false; + } + + return true; +} + + +bool de265_image::available_pred_blk(int xC,int yC, int nCbS, int xP, int yP, + int nPbW, int nPbH, int partIdx, int xN,int yN) const +{ + logtrace(LogMotion,"C:%d;%d P:%d;%d N:%d;%d size=%d;%d\n",xC,yC,xP,yP,xN,yN,nPbW,nPbH); + + int sameCb = (xC <= xN && xN < xC+nCbS && + yC <= yN && yN < yC+nCbS); + + bool availableN; + + if (!sameCb) { + availableN = available_zscan(xP,yP,xN,yN); + } + else { + availableN = !(nPbW<<1 == nCbS && nPbH<<1 == nCbS && // NxN + partIdx==1 && + yN >= yC+nPbH && xN < xC+nPbW); // xN/yN inside partIdx 2 + } + + if (availableN && get_pred_mode(xN,yN) == MODE_INTRA) { + availableN = false; + } + + return availableN; +} diff --git a/intrapred.cc b/intrapred.cc new file mode 100644 index 0000000..cf049b8 --- /dev/null +++ b/intrapred.cc @@ -0,0 +1,364 @@ +/* + * H.265 video codec. 
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "intrapred.h" +#include "transform.h" +#include "util.h" +#include + + +#include +#include + + + +void fillIntraPredModeCandidates(enum IntraPredMode candModeList[3], + enum IntraPredMode candIntraPredModeA, + enum IntraPredMode candIntraPredModeB) +{ + // build candidate list + + if (candIntraPredModeA == candIntraPredModeB) { + if (candIntraPredModeA < 2) { + candModeList[0] = INTRA_PLANAR; + candModeList[1] = INTRA_DC; + candModeList[2] = INTRA_ANGULAR_26; + } + else { + candModeList[0] = candIntraPredModeA; + candModeList[1] = (enum IntraPredMode)(2 + ((candIntraPredModeA-2 -1 +32) % 32)); + candModeList[2] = (enum IntraPredMode)(2 + ((candIntraPredModeA-2 +1 ) % 32)); + } + } + else { + candModeList[0] = candIntraPredModeA; + candModeList[1] = candIntraPredModeB; + + if (candIntraPredModeA != INTRA_PLANAR && + candIntraPredModeB != INTRA_PLANAR) { + candModeList[2] = INTRA_PLANAR; + } + else if (candIntraPredModeA != INTRA_DC && + candIntraPredModeB != INTRA_DC) { + candModeList[2] = INTRA_DC; + } + else { + candModeList[2] = INTRA_ANGULAR_26; + } + } + + /* + printf("candModeList: %d %d %d\n", + candModeList[0], + candModeList[1], + candModeList[2] + ); + */ +} + + +void fillIntraPredModeCandidates(enum IntraPredMode 
candModeList[3], int x,int y, int PUidx, + bool availableA, // left + bool availableB, // top + const de265_image* img) +{ + const seq_parameter_set* sps = &img->get_sps(); + + // block on left side + + enum IntraPredMode candIntraPredModeA, candIntraPredModeB; + if (availableA==false) { + candIntraPredModeA=INTRA_DC; + } + else if (img->get_pred_mode(x-1,y) != MODE_INTRA || + img->get_pcm_flag (x-1,y)) { + candIntraPredModeA=INTRA_DC; + } + else { + candIntraPredModeA = img->get_IntraPredMode_atIndex(PUidx-1); + } + + // block above + + if (availableB==false) { + candIntraPredModeB=INTRA_DC; + } + else if (img->get_pred_mode(x,y-1) != MODE_INTRA || + img->get_pcm_flag (x,y-1)) { + candIntraPredModeB=INTRA_DC; + } + else if (y-1 < ((y >> sps->Log2CtbSizeY) << sps->Log2CtbSizeY)) { + candIntraPredModeB=INTRA_DC; + } + else { + candIntraPredModeB = img->get_IntraPredMode_atIndex(PUidx-sps->PicWidthInMinPUs); + } + + + logtrace(LogSlice,"%d;%d candA:%d / candB:%d\n", x,y, + availableA ? candIntraPredModeA : -999, + availableB ? 
candIntraPredModeB : -999); + + + fillIntraPredModeCandidates(candModeList, + candIntraPredModeA, + candIntraPredModeB); +} + + +int find_intra_pred_mode(enum IntraPredMode mode, + enum IntraPredMode candModeList[3]) +{ + // check whether the mode is in the candidate list + + for (int i=0;i<3;i++) { + if (candModeList[i] == mode) { + return i; + } + } + + // sort candModeList + + if (candModeList[0] > candModeList[1]) { + std::swap(candModeList[0],candModeList[1]); + } + if (candModeList[0] > candModeList[2]) { + std::swap(candModeList[0],candModeList[2]); + } + if (candModeList[1] > candModeList[2]) { + std::swap(candModeList[1],candModeList[2]); + } + + // skip modes already in the candidate list + + int intraMode = mode; + + for (int i=2;i>=0;i--) { + if (intraMode >= candModeList[i]) { intraMode--; } + } + + return -intraMode-1; +} + + +void list_chroma_pred_candidates(enum IntraPredMode chroma_mode[5], + enum IntraPredMode luma_mode) +{ + enum IntraPredMode chroma_cand[5]; + chroma_cand[0] = INTRA_PLANAR; + chroma_cand[1] = INTRA_ANGULAR_26; + chroma_cand[2] = INTRA_ANGULAR_10; + chroma_cand[3] = INTRA_DC; + chroma_cand[4] = luma_mode; + + switch (luma_mode) { + case INTRA_PLANAR: chroma_cand[0] = INTRA_ANGULAR_34; break; + case INTRA_ANGULAR_26: chroma_cand[1] = INTRA_ANGULAR_34; break; + case INTRA_ANGULAR_10: chroma_cand[2] = INTRA_ANGULAR_34; break; + case INTRA_DC: chroma_cand[3] = INTRA_ANGULAR_34; break; + default: + // use defaults from above + break; + } +} + + +int get_intra_scan_idx(int log2TrafoSize, enum IntraPredMode intraPredMode, int cIdx, + const seq_parameter_set* sps) +{ + if (log2TrafoSize==2 || + (log2TrafoSize==3 && (cIdx==0 || + sps->ChromaArrayType==CHROMA_444))) { + /**/ if (intraPredMode >= 6 && intraPredMode <= 14) return 2; + else if (intraPredMode >= 22 && intraPredMode <= 30) return 1; + else return 0; + } + else { return 0; } +} + + +int get_intra_scan_idx_luma(int log2TrafoSize, enum IntraPredMode intraPredMode) +{ + if 
(log2TrafoSize==2 || log2TrafoSize==3) { + /**/ if (intraPredMode >= 6 && intraPredMode <= 14) return 2; + else if (intraPredMode >= 22 && intraPredMode <= 30) return 1; + else return 0; + } + else { return 0; } +} + +int get_intra_scan_idx_chroma(int log2TrafoSize, enum IntraPredMode intraPredMode) +{ + if (log2TrafoSize==1 || log2TrafoSize==2) { + /**/ if (intraPredMode >= 6 && intraPredMode <= 14) return 2; + else if (intraPredMode >= 22 && intraPredMode <= 30) return 1; + else return 0; + } + else { return 0; } +} + + +enum IntraPredMode lumaPredMode_to_chromaPredMode(enum IntraPredMode luma, + enum IntraChromaPredMode chroma) +{ + switch (chroma) { + case INTRA_CHROMA_LIKE_LUMA: + return luma; + + case INTRA_CHROMA_PLANAR_OR_34: + if (luma==INTRA_PLANAR) return INTRA_ANGULAR_34; + else return INTRA_PLANAR; + + case INTRA_CHROMA_ANGULAR_26_OR_34: + if (luma==INTRA_ANGULAR_26) return INTRA_ANGULAR_34; + else return INTRA_ANGULAR_26; + + case INTRA_CHROMA_ANGULAR_10_OR_34: + if (luma==INTRA_ANGULAR_10) return INTRA_ANGULAR_34; + else return INTRA_ANGULAR_10; + + case INTRA_CHROMA_DC_OR_34: + if (luma==INTRA_DC) return INTRA_ANGULAR_34; + else return INTRA_DC; + } + + + assert(false); + return INTRA_DC; +} + + + + +// (8.4.4.2.2) +template +void fill_border_samples(de265_image* img, + int xB,int yB, // in component specific resolution + int nT, int cIdx, + pixel_t* out_border) +{ + intra_border_computer c; + c.init(out_border, img, nT, cIdx, xB, yB); + c.preproc(); + c.fill_from_image(); + c.reference_sample_substitution(); +} + + +const int intraPredAngle_table[1+34] = + { 0, 0,32,26,21,17,13, 9, 5, 2, 0,-2,-5,-9,-13,-17,-21,-26, + -32,-26,-21,-17,-13,-9,-5,-2,0,2,5,9,13,17,21,26,32 }; + +const int invAngle_table[25-10] = + { -4096,-1638,-910,-630,-482,-390,-315,-256, + -315,-390,-482,-630,-910,-1638,-4096 }; + + +template +void decode_intra_prediction_internal(de265_image* img, + int xB0,int yB0, + enum IntraPredMode intraPredMode, + pixel_t* dst, int dstStride, 
+ int nT, int cIdx) +{ + pixel_t border_pixels_mem[4*MAX_INTRA_PRED_BLOCK_SIZE+1]; + pixel_t* border_pixels = &border_pixels_mem[2*MAX_INTRA_PRED_BLOCK_SIZE]; + + fill_border_samples(img, xB0,yB0, nT, cIdx, border_pixels); + + if (img->get_sps().range_extension.intra_smoothing_disabled_flag == 0 && + (cIdx==0 || img->get_sps().ChromaArrayType==CHROMA_444)) + { + intra_prediction_sample_filtering(img->get_sps(), border_pixels, nT, cIdx, intraPredMode); + } + + + switch (intraPredMode) { + case INTRA_PLANAR: + intra_prediction_planar(dst,dstStride, nT,cIdx, border_pixels); + break; + case INTRA_DC: + intra_prediction_DC(dst,dstStride, nT,cIdx, border_pixels); + break; + default: + { + int bit_depth = img->get_bit_depth(cIdx); + bool disableIntraBoundaryFilter = + (img->get_sps().range_extension.implicit_rdpcm_enabled_flag && + img->get_cu_transquant_bypass(xB0,yB0)); + + intra_prediction_angular(dst,dstStride, bit_depth,disableIntraBoundaryFilter, + xB0,yB0,intraPredMode,nT,cIdx, border_pixels); + } + break; + } +} + + +// (8.4.4.2.1) +void decode_intra_prediction(de265_image* img, + int xB0,int yB0, + enum IntraPredMode intraPredMode, + int nT, int cIdx) +{ + logtrace(LogIntraPred,"decode_intra_prediction xy0:%d/%d mode=%d nT=%d, cIdx=%d\n", + xB0,yB0, intraPredMode, nT,cIdx); + /* + printf("decode_intra_prediction xy0:%d/%d mode=%d nT=%d, cIdx=%d\n", + xB0,yB0, intraPredMode, nT,cIdx); + */ + + if (img->high_bit_depth(cIdx)) { + decode_intra_prediction_internal(img,xB0,yB0, intraPredMode, + img->get_image_plane_at_pos_NEW(cIdx,xB0,yB0), + img->get_image_stride(cIdx), + nT,cIdx); + } + else { + decode_intra_prediction_internal(img,xB0,yB0, intraPredMode, + img->get_image_plane_at_pos_NEW(cIdx,xB0,yB0), + img->get_image_stride(cIdx), + nT,cIdx); + } +} + + +// TODO: remove this +template <> void decode_intra_prediction(de265_image* img, + int xB0,int yB0, + enum IntraPredMode intraPredMode, + uint8_t* dst, int nT, int cIdx) +{ + 
decode_intra_prediction_internal(img,xB0,yB0, intraPredMode, + dst,nT, + nT,cIdx); +} + + +// TODO: remove this +template <> void decode_intra_prediction(de265_image* img, + int xB0,int yB0, + enum IntraPredMode intraPredMode, + uint16_t* dst, int nT, int cIdx) +{ + decode_intra_prediction_internal(img,xB0,yB0, intraPredMode, + dst,nT, + nT,cIdx); +} diff --git a/libde265/acceleration.h b/libde265/acceleration.h new file mode 100644 index 0000000..2f1148b --- /dev/null +++ b/libde265/acceleration.h @@ -0,0 +1,359 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef DE265_ACCELERATION_H +#define DE265_ACCELERATION_H + +#include +#include +#include + + +struct acceleration_functions +{ + void (*put_weighted_pred_avg_8)(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height); + + void (*put_unweighted_pred_8)(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height); + + void (*put_weighted_pred_8)(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, + int w,int o,int log2WD); + void (*put_weighted_bipred_8)(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD); + + + void (*put_weighted_pred_avg_16)(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, int bit_depth); + + void (*put_unweighted_pred_16)(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, int bit_depth); + + void (*put_weighted_pred_16)(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, + int w,int o,int log2WD, int bit_depth); + void (*put_weighted_bipred_16)(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD, int bit_depth); + + + void put_weighted_pred_avg(void *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, int bit_depth) const; + + void put_unweighted_pred(void *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, int bit_depth) const; + + void put_weighted_pred(void *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, 
+ int w,int o,int log2WD, int bit_depth) const; + void put_weighted_bipred(void *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD, int bit_depth) const; + + + + + void (*put_hevc_epel_8)(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer); + void (*put_hevc_epel_h_8)(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + void (*put_hevc_epel_v_8)(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + void (*put_hevc_epel_hv_8)(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + + void (*put_hevc_qpel_8[4][4])(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, int width, int height, + int16_t* mcbuffer); + + + void (*put_hevc_epel_16)(int16_t *dst, ptrdiff_t dststride, + const uint16_t *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + void (*put_hevc_epel_h_16)(int16_t *dst, ptrdiff_t dststride, + const uint16_t *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + void (*put_hevc_epel_v_16)(int16_t *dst, ptrdiff_t dststride, + const uint16_t *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + void (*put_hevc_epel_hv_16)(int16_t *dst, ptrdiff_t dststride, + const uint16_t *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + + void (*put_hevc_qpel_16[4][4])(int16_t *dst, ptrdiff_t dststride, + const uint16_t *src, ptrdiff_t srcstride, int 
width, int height, + int16_t* mcbuffer, int bit_depth); + + + void put_hevc_epel(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth) const; + void put_hevc_epel_h(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth) const; + void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth) const; + void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth) const; + + void put_hevc_qpel(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int16_t* mcbuffer, int dX,int dY, int bit_depth) const; + + + // --- inverse transforms --- + + void (*transform_bypass)(int32_t *residual, const int16_t *coeffs, int nT); + void (*transform_bypass_rdpcm_v)(int32_t *r, const int16_t *coeffs, int nT); + void (*transform_bypass_rdpcm_h)(int32_t *r, const int16_t *coeffs, int nT); + + // 8 bit + + void (*transform_skip_8)(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride); // no transform + void (*transform_skip_rdpcm_v_8)(uint8_t *_dst, const int16_t *coeffs, int nT, ptrdiff_t _stride); + void (*transform_skip_rdpcm_h_8)(uint8_t *_dst, const int16_t *coeffs, int nT, ptrdiff_t _stride); + void (*transform_4x4_dst_add_8)(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); // iDST + void (*transform_add_8[4])(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); // iDCT + + // 9-16 bit + + void (*transform_skip_16)(uint16_t *_dst, const int16_t *coeffs, ptrdiff_t _stride, int bit_depth); // no transform + void (*transform_4x4_dst_add_16)(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); // iDST + void 
(*transform_add_16[4])(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); // iDCT + + + void (*rotate_coefficients)(int16_t *coeff, int nT); + + void (*transform_idst_4x4)(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); + void (*transform_idct_4x4)(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); + void (*transform_idct_8x8)(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); + void (*transform_idct_16x16)(int32_t *dst,const int16_t *coeffs,int bdShift, int max_coeff_bits); + void (*transform_idct_32x32)(int32_t *dst,const int16_t *coeffs,int bdShift, int max_coeff_bits); + void (*add_residual_8)(uint8_t *dst, ptrdiff_t stride, const int32_t* r, int nT, int bit_depth); + void (*add_residual_16)(uint16_t *dst,ptrdiff_t stride,const int32_t* r, int nT, int bit_depth); + + template + void add_residual(pixel_t *dst, ptrdiff_t stride, const int32_t* r, int nT, int bit_depth) const; + + void (*rdpcm_v)(int32_t* residual, const int16_t* coeffs, int nT,int tsShift,int bdShift); + void (*rdpcm_h)(int32_t* residual, const int16_t* coeffs, int nT,int tsShift,int bdShift); + + void (*transform_skip_residual)(int32_t *residual, const int16_t *coeffs, int nT, + int tsShift,int bdShift); + + + template void transform_skip(pixel_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const; + template void transform_skip_rdpcm_v(pixel_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const; + template void transform_skip_rdpcm_h(pixel_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const; + template void transform_4x4_dst_add(pixel_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const; + template void transform_add(int sizeIdx, pixel_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const; + + + + // --- forward transforms --- + + void (*fwd_transform_4x4_dst_8)(int16_t *coeffs, const int16_t* src, ptrdiff_t 
stride); // fDST + + // indexed with (log2TbSize-2) + void (*fwd_transform_8[4]) (int16_t *coeffs, const int16_t *src, ptrdiff_t stride); // fDCT + + + // forward Hadamard transform (without scaling factor) + // (4x4,8x8,16x16,32x32) indexed with (log2TbSize-2) + void (*hadamard_transform_8[4]) (int16_t *coeffs, const int16_t *src, ptrdiff_t stride); +}; + + +/* +template <> inline void acceleration_functions::put_weighted_pred_avg(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, int bit_depth) { put_weighted_pred_avg_8(_dst,dststride,src1,src2,srcstride,width,height); } +template <> inline void acceleration_functions::put_weighted_pred_avg(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, int bit_depth) { put_weighted_pred_avg_16(_dst,dststride,src1,src2, + srcstride,width,height,bit_depth); } + +template <> inline void acceleration_functions::put_unweighted_pred(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, int bit_depth) { put_unweighted_pred_8(_dst,dststride,src,srcstride,width,height); } +template <> inline void acceleration_functions::put_unweighted_pred(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, int bit_depth) { put_unweighted_pred_16(_dst,dststride,src,srcstride,width,height,bit_depth); } + +template <> inline void acceleration_functions::put_weighted_pred(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, + int w,int o,int log2WD, int bit_depth) { put_weighted_pred_8(_dst,dststride,src,srcstride,width,height,w,o,log2WD); } +template <> inline void acceleration_functions::put_weighted_pred(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, + int w,int o,int log2WD, int bit_depth) { 
put_weighted_pred_16(_dst,dststride,src,srcstride,width,height,w,o,log2WD,bit_depth); } + +template <> inline void acceleration_functions::put_weighted_bipred(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD, int bit_depth) { put_weighted_bipred_8(_dst,dststride,src1,src2,srcstride, + width,height, + w1,o1,w2,o2,log2WD); } +template <> inline void acceleration_functions::put_weighted_bipred(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD, int bit_depth) { put_weighted_bipred_16(_dst,dststride,src1,src2,srcstride, + width,height, + w1,o1,w2,o2,log2WD,bit_depth); } +*/ + + /* The put_ member functions below dispatch on bit_depth: bit_depth <= 8 calls the _8 function pointer with the pixel buffer cast to a uint8_t pointer, anything larger calls the _16 function pointer with a uint16_t pointer cast and bit_depth forwarded. */ +inline void acceleration_functions::put_weighted_pred_avg(void* _dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, int bit_depth) const +{ + if (bit_depth <= 8) + put_weighted_pred_avg_8((uint8_t*)_dst,dststride,src1,src2,srcstride,width,height); + else + put_weighted_pred_avg_16((uint16_t*)_dst,dststride,src1,src2,srcstride,width,height,bit_depth); +} + + +inline void acceleration_functions::put_unweighted_pred(void* _dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, int bit_depth) const +{ + if (bit_depth <= 8) + put_unweighted_pred_8((uint8_t*)_dst,dststride,src,srcstride,width,height); + else + put_unweighted_pred_16((uint16_t*)_dst,dststride,src,srcstride,width,height,bit_depth); +} + + +inline void acceleration_functions::put_weighted_pred(void* _dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, + int w,int o,int log2WD, int bit_depth) const +{ + if (bit_depth <= 8) + put_weighted_pred_8((uint8_t*)_dst,dststride,src,srcstride,width,height,w,o,log2WD); + else +
put_weighted_pred_16((uint16_t*)_dst,dststride,src,srcstride,width,height,w,o,log2WD,bit_depth); +} + + +inline void acceleration_functions::put_weighted_bipred(void* _dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD, int bit_depth) const +{ + if (bit_depth <= 8) + put_weighted_bipred_8((uint8_t*)_dst,dststride,src1,src2,srcstride, width,height, w1,o1,w2,o2,log2WD); + else + put_weighted_bipred_16((uint16_t*)_dst,dststride,src1,src2,srcstride, width,height, w1,o1,w2,o2,log2WD,bit_depth); +} + + + /* put_hevc_epel and its _h, _v, _hv variants: same bit_depth dispatch, here with src cast to a const uint8_t pointer or a const uint16_t pointer. Note that put_hevc_epel_8 is called without bit_depth, whereas the 8-bit _h, _v and _hv function pointers do receive it. */ +inline void acceleration_functions::put_hevc_epel(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth) const +{ + if (bit_depth <= 8) + put_hevc_epel_8(dst,dststride,(const uint8_t*)src,srcstride,width,height,mx,my,mcbuffer); + else + put_hevc_epel_16(dst,dststride,(const uint16_t*)src,srcstride,width,height,mx,my,mcbuffer, bit_depth); +} + +inline void acceleration_functions::put_hevc_epel_h(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth) const +{ + if (bit_depth <= 8) + put_hevc_epel_h_8(dst,dststride,(const uint8_t*)src,srcstride,width,height,mx,my,mcbuffer,bit_depth); + else + put_hevc_epel_h_16(dst,dststride,(const uint16_t*)src,srcstride,width,height,mx,my,mcbuffer,bit_depth); +} + +inline void acceleration_functions::put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth) const +{ + if (bit_depth <= 8) + put_hevc_epel_v_8(dst,dststride,(const uint8_t*)src,srcstride,width,height,mx,my,mcbuffer,bit_depth); + else + put_hevc_epel_v_16(dst,dststride,(const uint16_t*)src,srcstride,width,height,mx,my,mcbuffer, bit_depth); +} + +inline void
acceleration_functions::put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth) const +{ + if (bit_depth <= 8) + put_hevc_epel_hv_8(dst,dststride,(const uint8_t*)src,srcstride,width,height,mx,my,mcbuffer,bit_depth); + else + put_hevc_epel_hv_16(dst,dststride,(const uint16_t*)src,srcstride,width,height,mx,my,mcbuffer, bit_depth); +} + /* The qpel function is picked from the put_hevc_qpel_8 or put_hevc_qpel_16 table by [dX][dY]; only the 16-bit entry receives bit_depth. */ +inline void acceleration_functions::put_hevc_qpel(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int16_t* mcbuffer, int dX,int dY, int bit_depth) const +{ + if (bit_depth <= 8) + put_hevc_qpel_8[dX][dY](dst,dststride,(const uint8_t*)src,srcstride,width,height,mcbuffer); + else + put_hevc_qpel_16[dX][dY](dst,dststride,(const uint16_t*)src,srcstride,width,height,mcbuffer, bit_depth); +} + /* Per-pixel-type specializations: the uint8_t version forwards to the _8 function pointer, the uint16_t version to the _16 function pointer. NOTE(review): the explicit template argument after each function name appears to have been lost in extraction -- confirm against the original header. */ +template <> inline void acceleration_functions::transform_skip(uint8_t *dst, const int16_t *coeffs,ptrdiff_t stride, int bit_depth) const { transform_skip_8(dst,coeffs,stride); } +template <> inline void acceleration_functions::transform_skip(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const { transform_skip_16(dst,coeffs,stride, bit_depth); } + /* The uint8_t rdpcm variants assert bit_depth==8; the uint16_t variants only assert(false), their forwarding call is commented out. */ +template <> inline void acceleration_functions::transform_skip_rdpcm_v(uint8_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const { assert(bit_depth==8); transform_skip_rdpcm_v_8(dst,coeffs,nT,stride); } +template <> inline void acceleration_functions::transform_skip_rdpcm_h(uint8_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const { assert(bit_depth==8); transform_skip_rdpcm_h_8(dst,coeffs,nT,stride); } +template <> inline void acceleration_functions::transform_skip_rdpcm_v(uint16_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const { assert(false); /*transform_skip_rdpcm_v_8(dst,coeffs,nT,stride);*/ } +template <> inline void
acceleration_functions::transform_skip_rdpcm_h(uint16_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const { assert(false); /*transform_skip_rdpcm_h_8(dst,coeffs,nT,stride);*/ } + + +template <> inline void acceleration_functions::transform_4x4_dst_add(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride,int bit_depth) const { transform_4x4_dst_add_8(dst,coeffs,stride); } +template <> inline void acceleration_functions::transform_4x4_dst_add(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride,int bit_depth) const { transform_4x4_dst_add_16(dst,coeffs,stride,bit_depth); } + /* transform_add indexes the transform_add_8 or transform_add_16 table with sizeIdx; add_residual passes bit_depth to both the 8-bit and the 16-bit function pointer. */ +template <> inline void acceleration_functions::transform_add(int sizeIdx, uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const { transform_add_8[sizeIdx](dst,coeffs,stride); } +template <> inline void acceleration_functions::transform_add(int sizeIdx, uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const { transform_add_16[sizeIdx](dst,coeffs,stride,bit_depth); } + +template <> inline void acceleration_functions::add_residual(uint8_t *dst, ptrdiff_t stride, const int32_t* r, int nT, int bit_depth) const { add_residual_8(dst,stride,r,nT,bit_depth); } +template <> inline void acceleration_functions::add_residual(uint16_t *dst, ptrdiff_t stride, const int32_t* r, int nT, int bit_depth) const { add_residual_16(dst,stride,r,nT,bit_depth); } + +#endif diff --git a/libde265/alloc_pool.h b/libde265/alloc_pool.h new file mode 100644 index 0000000..41dd4a4 --- /dev/null +++ b/libde265/alloc_pool.h @@ -0,0 +1,61 @@ +/* + * H.265 video codec. + * Copyright (c) 2014 struktur AG, Dirk Farin + * + * Authors: Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version.
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef ALLOC_POOL_H +#define ALLOC_POOL_H + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + /* NOTE(review): the #include directives below (two unconditional, one under HAVE_STDINT_H, one under HAVE_CSTDINT) have no header name; the angle-bracket operands appear to have been stripped and must be restored before this header can compile. */ +#include +#include +#ifdef HAVE_STDINT_H +#include +#endif +#ifdef HAVE_CSTDINT +#include +#endif + + /* alloc_pool: interface of a pool allocator. It is constructed with an object size, a pool size (default 1000) and a grow flag (default true). new_obj() returns a void pointer and delete_obj() takes one back -- presumably objects of mObjSize bytes carved out of m_memBlocks; the implementation is not part of this header. */ +class alloc_pool +{ + public: + alloc_pool(size_t objSize, int poolSize=1000, bool grow=true); + ~alloc_pool(); + + void* new_obj(const size_t size); + void delete_obj(void*); + void purge(); + + private: + size_t mObjSize; + int mPoolSize; + bool mGrow; + /* NOTE(review): both std::vector members below lack an element type; it appears to have been lost together with the include operands -- TODO confirm against the original header. */ + std::vector m_memBlocks; + std::vector m_freeList; + + void add_memory_block(); +}; + +#endif diff --git a/libde265/bitstream.h b/libde265/bitstream.h new file mode 100644 index 0000000..3100b43 --- /dev/null +++ b/libde265/bitstream.h @@ -0,0 +1,63 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version.
+ */ + +#ifndef DE265_BITSTREAM_H +#define DE265_BITSTREAM_H + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#ifdef HAVE_STDBOOL_H +#include +#endif +#include + + +#define MAX_UVLC_LEADING_ZEROS 20 +#define UVLC_ERROR -99999 + + +typedef struct { + uint8_t* data; + int bytes_remaining; + + uint64_t nextbits; // left-aligned bits + int nextbits_cnt; +} bitreader; + +void bitreader_init(bitreader*, unsigned char* buffer, int len); +void bitreader_refill(bitreader*); // refill to at least 56+1 bits +int next_bit(bitreader*); +int next_bit_norefill(bitreader*); +int get_bits(bitreader*, int n); +int get_bits_fast(bitreader*, int n); +int peek_bits(bitreader*, int n); +void skip_bits(bitreader*, int n); +void skip_bits_fast(bitreader*, int n); +void skip_to_byte_boundary(bitreader*); +void prepare_for_CABAC(bitreader*); +int get_uvlc(bitreader*); // may return UVLC_ERROR +int get_svlc(bitreader*); // may return UVLC_ERROR + +bool check_rbsp_trailing_bits(bitreader*); // return true if remaining filler bits are all zero + +#endif diff --git a/libde265/cabac.h b/libde265/cabac.h new file mode 100644 index 0000000..e28aeeb --- /dev/null +++ b/libde265/cabac.h @@ -0,0 +1,211 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef DE265_CABAC_H +#define DE265_CABAC_H + +#include +#include "contextmodel.h" + + +typedef struct { + uint8_t* bitstream_start; + uint8_t* bitstream_curr; + uint8_t* bitstream_end; + + uint32_t range; + uint32_t value; + int16_t bits_needed; +} CABAC_decoder; + + +void init_CABAC_decoder(CABAC_decoder* decoder, uint8_t* bitstream, int length); +void init_CABAC_decoder_2(CABAC_decoder* decoder); +int decode_CABAC_bit(CABAC_decoder* decoder, context_model* model); +int decode_CABAC_TU(CABAC_decoder* decoder, int cMax, context_model* model); +int decode_CABAC_term_bit(CABAC_decoder* decoder); + +int decode_CABAC_bypass(CABAC_decoder* decoder); +int decode_CABAC_TU_bypass(CABAC_decoder* decoder, int cMax); +int decode_CABAC_FL_bypass(CABAC_decoder* decoder, int nBits); +int decode_CABAC_TR_bypass(CABAC_decoder* decoder, int cRiceParam, int cTRMax); +int decode_CABAC_EGk_bypass(CABAC_decoder* decoder, int k); + + +// --------------------------------------------------------------------------- + +class CABAC_encoder +{ +public: + CABAC_encoder() : mCtxModels(NULL) { } + virtual ~CABAC_encoder() { } + + virtual int size() const = 0; + virtual void reset() = 0; + + // --- VLC --- + + virtual void write_bits(uint32_t bits,int n) = 0; + virtual void write_bit(int bit) { write_bits(bit,1); } + virtual void write_uvlc(int value); + virtual void write_svlc(int value); + virtual void write_startcode() = 0; + virtual void skip_bits(int nBits) = 0; + + virtual void add_trailing_bits(); + virtual int number_free_bits_in_byte() const = 0; + + // output all remaining bits and fill with zeros to next byte boundary + virtual void flush_VLC() { } + + + // --- CABAC --- + + void set_context_models(context_model_table* models) { mCtxModels=models; } + + virtual void init_CABAC() { } + virtual void write_CABAC_bit(int modelIdx, int bit) = 0; + virtual void write_CABAC_bypass(int bit) = 0; + virtual void write_CABAC_TU_bypass(int value, int cMax); + virtual void 
write_CABAC_FL_bypass(int value, int nBits); + virtual void write_CABAC_term_bit(int bit) = 0; + virtual void flush_CABAC() { } + + void write_CABAC_EGk(int absolute_symbol, int k); // absolute_symbol >= 0 + + virtual bool modifies_context() const = 0; + + float RDBits_for_CABAC_bin(int modelIdx, int bit); + + protected: + context_model_table* mCtxModels; +}; + + +class CABAC_encoder_bitstream : public CABAC_encoder +{ +public: + CABAC_encoder_bitstream(); + ~CABAC_encoder_bitstream(); + + virtual void reset(); + + virtual int size() const { return data_size; } + uint8_t* data() const { return data_mem; } + + // --- VLC --- + + virtual void write_bits(uint32_t bits,int n); + virtual void write_startcode(); + virtual void skip_bits(int nBits); + + virtual int number_free_bits_in_byte() const; + + // output all remaining bits and fill with zeros to next byte boundary + virtual void flush_VLC(); + + + // --- CABAC --- + + virtual void init_CABAC(); + virtual void write_CABAC_bit(int modelIdx, int bit); + virtual void write_CABAC_bypass(int bit); + virtual void write_CABAC_term_bit(int bit); + virtual void flush_CABAC(); + + virtual bool modifies_context() const { return true; } + +private: + // data buffer + + uint8_t* data_mem; + uint32_t data_capacity; + uint32_t data_size; + char state; // for inserting emulation-prevention bytes + + // VLC + + uint32_t vlc_buffer; + uint32_t vlc_buffer_len; + + + // CABAC + + uint32_t range; + uint32_t low; + int8_t bits_left; + uint8_t buffered_byte; + uint16_t num_buffered_bytes; + + + void check_size_and_resize(int nBytes); + void testAndWriteOut(); + void write_out(); + void append_byte(int byte); +}; + + +class CABAC_encoder_estim : public CABAC_encoder +{ +public: + CABAC_encoder_estim() : mFracBits(0) { } + + virtual void reset() { mFracBits=0; } + + virtual int size() const { return mFracBits>>(15+3); } + + uint64_t getFracBits() const { return mFracBits; } + float getRDBits() const { return mFracBits / float(1<<15); } + + 
// --- VLC --- + + virtual void write_bits(uint32_t bits,int n) { mFracBits += n<<15; } + virtual void write_bit(int bit) { mFracBits+=1<<15; } + virtual void write_startcode() { mFracBits += (1<<15)*8*3; } + virtual void skip_bits(int nBits) { mFracBits += nBits<<15; } + virtual int number_free_bits_in_byte() const { return 0; } // TODO, good enough for now + + // --- CABAC --- + + virtual void write_CABAC_bit(int modelIdx, int bit); + virtual void write_CABAC_bypass(int bit) { + mFracBits += 0x8000; + } + virtual void write_CABAC_FL_bypass(int value, int nBits) { + mFracBits += nBits<<15; + } + virtual void write_CABAC_term_bit(int bit) { /* not implemented (not needed) */ } + + virtual bool modifies_context() const { return true; } + + protected: + uint64_t mFracBits; +}; + + +class CABAC_encoder_estim_constant : public CABAC_encoder_estim +{ + public: + void write_CABAC_bit(int modelIdx, int bit); + + virtual bool modifies_context() const { return false; } +}; + +#endif diff --git a/libde265/configparam.h b/libde265/configparam.h new file mode 100644 index 0000000..58b1daa --- /dev/null +++ b/libde265/configparam.h @@ -0,0 +1,401 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * Authors: struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef CONFIG_PARAM_H +#define CONFIG_PARAM_H + +#include "en265.h" +#include "util.h" + /* NOTE(review): the five #include directives below have no header name; the angle-bracket operands appear to have been stripped and must be restored before this header can compile. */ +#include +#include +#include +#include +#include + + +/* Notes: probably best to keep cmd-line-options here. So it will be: + - automatically consistent even when having different combinations of algorithms + - no other place to edit + - if needed, one can still override it at another place + */ + +// TODO: set a stack of default prefixes in config_parameters, such that all options added +// will receive this namespace prefix. + /* option_base: common base class of all options. get_name() is mPrefix + mIDName and add_namespace_prefix(p) prepends p and a colon to mPrefix. Subclasses must implement is_defined(), has_default() and getTypeDescr(); the default processCmdLineArguments() returns false and the default get_default_string() returns "N/A". The long option is kept as the raw const char pointer passed to set_cmd_line_options(), so that string must outlive the option. */ +// TODO: add the possibility to remove long options again, i.e., not use the default id name +class option_base +{ + public: + option_base() : mShortOption(0), mLongOption(NULL) { } + option_base(const char* name) : mIDName(name), mShortOption(0), mLongOption(NULL) { } + virtual ~option_base() { } + + + // --- option identifier --- + + void set_ID(const char* name) { mIDName=name; } + void add_namespace_prefix(std::string prefix) { mPrefix = prefix + ":" + mPrefix; } + + std::string get_name() const { return mPrefix + mIDName; } + + + // --- description --- + + void set_description(std::string descr) { mDescription = descr; } + std::string get_description() const { return mDescription; } + bool has_description() const { return !mDescription.empty(); } + + + // --- value --- + + virtual bool is_defined() const = 0; + bool is_undefined() const { return !is_defined(); } + + virtual bool has_default() const = 0; + + + // --- command line options ---- + + void set_cmd_line_options(const char* long_option, char short_option = 0) + { + mShortOption = short_option; + mLongOption = long_option; + } + + void set_short_option(char short_option) { mShortOption=short_option; } + + void unsetCmdLineOption() + { + mShortOption = 0; + mLongOption = NULL; + } + /* NOTE(review): hasLongOption() below is hard-wired to true (the NULL test is commented out); getLongOption() falls back to get_name() when mLongOption is NULL, so unsetCmdLineOption() does not remove the long option. */ + bool hasShortOption() const { return mShortOption!=0; } + char getShortOption() const { return mShortOption; } + bool hasLongOption() const { return true; } //mLongOption!=NULL; } + std::string
getLongOption() const { return mLongOption ? std::string(mLongOption) : get_name(); } + + virtual LIBDE265_API bool processCmdLineArguments(char** argv, int* argc, int idx) { return false; } + + + + virtual std::string getTypeDescr() const = 0; + + virtual std::string get_default_string() const { return "N/A"; } + + private: + std::string mPrefix; + std::string mIDName; + + std::string mDescription; + + char mShortOption; + const char* mLongOption; +}; + + + /* option_bool: boolean option. Conversion to bool yields value when one was set, otherwise default_value, and asserts that one of the two exists. processCmdLineArguments() ignores its arguments and sets the option to true. NOTE(review): get_default_string() reads default_value without checking default_set, and the constructor does not initialize default_value. */ +class option_bool : public option_base +{ +public: + option_bool() : value_set(false), default_set(false) { } + + operator bool() const { + assert(value_set || default_set); + return value_set ? value : default_value; + } + + virtual bool is_defined() const { return value_set || default_set; } + virtual bool has_default() const { return default_set; } + + void set_default(bool v) { default_value=v; default_set=true; } + virtual std::string get_default_string() const { return default_value ? "true":"false"; } + + virtual std::string getTypeDescr() const { return "(boolean)"; } + virtual LIBDE265_API bool processCmdLineArguments(char** argv, int* argc, int idx) { set(true); return true; } + + bool set(bool v) { value_set=true; value=v; return true; } + + private: + bool value_set; + bool value; + + bool default_set; + bool default_value; +}; + + /* option_string: string option. get() and the std::string conversion return value when one was set, otherwise default_value, and assert that one of the two exists; set() and operator= always accept their argument. */ +class option_string : public option_base +{ +public: + option_string() : value_set(false), default_set(false) { } + + const option_string& operator=(std::string v) { value=v; value_set=true; return *this; } + + operator std::string() const { return get(); } + std::string get() const { + assert(value_set || default_set); + return value_set ? 
value : default_value; + } + + virtual bool is_defined() const { return value_set || default_set; } + virtual bool has_default() const { return default_set; } + + void set_default(std::string v) { default_value=v; default_set=true; } + virtual LIBDE265_API std::string get_default_string() const { return default_value; } + + virtual LIBDE265_API std::string getTypeDescr() const { return "(string)"; } + virtual LIBDE265_API bool processCmdLineArguments(char** argv, int* argc, int idx); + + bool set(std::string v) { value_set=true; value=v; return true; } + + private: + bool value_set; + std::string value; + + bool default_set; + std::string default_value; +}; + + /* option_int: integer option with optional lower and upper limits and an optional list of valid values. set() stores v only when is_valid(v) holds and returns whether it was accepted; operator=(int) and set_default() store their argument without calling is_valid(). NOTE(review): the element type of std::vector in set_valid_values() and valid_values_set appears to have been lost in extraction -- TODO confirm. */ +class option_int : public option_base +{ +public: + option_int() : value_set(false), default_set(false), + have_low_limit(false), have_high_limit(false) { } + + void set_minimum(int mini) { have_low_limit =true; low_limit =mini; } + void set_maximum(int maxi) { have_high_limit=true; high_limit=maxi; } + void set_range(int mini,int maxi); + void set_valid_values(const std::vector& v) { valid_values_set = v; } + + const option_int& operator=(int v) { value=v; value_set=true; return *this; } + + int operator() () const { + assert(value_set || default_set); + return value_set ? 
value : default_value; + } + operator int() const { return operator()(); } + + virtual bool is_defined() const { return value_set || default_set; } + virtual bool has_default() const { return default_set; } + + void set_default(int v) { default_value=v; default_set=true; } + virtual LIBDE265_API std::string get_default_string() const; + + virtual LIBDE265_API std::string getTypeDescr() const; + virtual LIBDE265_API bool processCmdLineArguments(char** argv, int* argc, int idx); + + bool set(int v) { + if (is_valid(v)) { value_set=true; value=v; return true; } + else { return false; } + } + + private: + bool value_set; + int value; + + bool default_set; + int default_value; + + bool have_low_limit, have_high_limit; + int low_limit, high_limit; + + std::vector valid_values_set; + + bool is_valid(int v) const; +}; + + + /* choice_option_base: type-independent part of the choice options. set() forwards to the pure virtual set_value(). choice_string_table is a mutable cache that is released with delete[] in the destructor and in invalidate_choices_string_table(); it is presumably filled by get_choices_string_table(), whose body is not part of this header. */ +class choice_option_base : public option_base +{ +public: + choice_option_base() : choice_string_table(NULL) { } + ~choice_option_base() { delete[] choice_string_table; } + + bool set(std::string v) { return set_value(v); } + virtual bool set_value(const std::string& val) = 0; + virtual std::vector get_choice_names() const = 0; + + virtual std::string getTypeDescr() const; + virtual LIBDE265_API bool processCmdLineArguments(char** argv, int* argc, int idx); + + const char** get_choices_string_table() const; + + protected: + void invalidate_choices_string_table() { + delete[] choice_string_table; + choice_string_table = NULL; + } + + private: + mutable char* choice_string_table; +}; + + /* choice_option: option that selects one of several (name, id) pairs registered with add_choice(). NOTE(review): the template parameter list is missing on the next line although the class uses T; it appears to have been lost in extraction. set_value() marks the option as set even when the name matches no choice; it then returns false and leaves selectedID unchanged. setID() does not touch value_set, so getID() keeps returning defaultID until set_value() has been called. set_default() asserts when val is not a registered id. */ +template class choice_option : public choice_option_base +{ + public: + choice_option() : default_set(false), value_set(false) { } + + // --- initialization --- + + void add_choice(const std::string& s, T id, bool default_value=false) { + choices.push_back( std::make_pair(s,id) ); + if (default_value) { + defaultID = id; + defaultValue = s; + default_set = true; + } + + invalidate_choices_string_table(); + } + + void set_default(T val) { +#ifdef FOR_LOOP_AUTO_SUPPORT +
FOR_LOOP(auto, c, choices) { +#else + for (typename std::vector< std::pair >::const_iterator it=choices.begin(); it!=choices.end(); ++it) { + const std::pair & c = *it; +#endif + if (c.second == val) { + defaultID = val; + defaultValue = c.first; + default_set = true; + return; + } + } + + assert(false); // value does not exist + } + + + // --- usage --- + + bool set_value(const std::string& val) // returns false if it is not a valid option + { + value_set = true; + selectedValue=val; + + validValue = false; + +#ifdef FOR_LOOP_AUTO_SUPPORT + FOR_LOOP(auto, c, choices) { +#else + for (typename std::vector< std::pair >::const_iterator it=choices.begin(); it!=choices.end(); ++it) { + const std::pair & c = *it; +#endif + if (val == c.first) { + selectedID = c.second; + validValue = true; + } + } + + return validValue; + } + + bool isValidValue() const { return validValue; } + + const std::string& getValue() const { + assert(value_set || default_set); + return value_set ? selectedValue : defaultValue; + } + void setID(T id) { selectedID=id; validValue=true; } + const T getID() const { return value_set ? 
selectedID : defaultID; } + + virtual bool is_defined() const { return value_set || default_set; } + virtual bool has_default() const { return default_set; } + + std::vector get_choice_names() const + { + std::vector names; +#ifdef FOR_LOOP_AUTO_SUPPORT + FOR_LOOP(auto, p, choices) { +#else + for (typename std::vector< std::pair >::const_iterator it=choices.begin(); it!=choices.end(); ++it) { + const std::pair & p = *it; +#endif + names.push_back(p.first); + } + return names; + } + + std::string get_default_string() const { return defaultValue; } + + T operator() () const { return (T)getID(); } + + private: + std::vector< std::pair > choices; + + bool default_set; + std::string defaultValue; + T defaultID; + + bool value_set; + std::string selectedValue; + T selectedID; + + bool validValue; +}; + + + + /* config_parameters: collection of options registered through add_option(). The set_bool, set_int, set_string and set_choice setters and the get_parameter_ queries take a const char* param argument and are implemented outside this header. param_string_table is a mutable cache released with delete[] in the destructor; the destructor does not delete the registered option_base pointers. NOTE(review): the std::vector element types in this class appear to have been lost in extraction. */ +class config_parameters +{ + public: + config_parameters() : param_string_table(NULL) { } + ~config_parameters() { delete[] param_string_table; } + + void LIBDE265_API add_option(option_base* o); + + void LIBDE265_API print_params() const; + bool LIBDE265_API parse_command_line_params(int* argc, char** argv, int* first_idx=NULL, + bool ignore_unknown_options=false); + + + // --- connection to C API --- + + std::vector get_parameter_IDs() const; + enum en265_parameter_type get_parameter_type(const char* param) const; + + std::vector get_parameter_choices(const char* param) const; + + bool set_bool(const char* param, bool value); + bool set_int(const char* param, int value); + bool set_string(const char* param, const char* value); + bool set_choice(const char* param, const char* value); + + const char** get_parameter_string_table() const; + const char** get_parameter_choices_table(const char* param) const; + + private: + std::vector mOptions; + + option_base* find_option(const char* param) const; + + mutable char* param_string_table; +}; + +#endif diff --git a/libde265/contextmodel.h b/libde265/contextmodel.h new file mode 100644 index 0000000..cde83e1 --- /dev/null
+++ b/libde265/contextmodel.h @@ -0,0 +1,130 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * Authors: struktur AG, Dirk Farin + * Min Chen + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_CONTEXTMODEL_H +#define DE265_CONTEXTMODEL_H + +#include "libde265/cabac.h" +#include "libde265/de265.h" + +#include +#include + + +struct context_model { + uint8_t MPSbit : 1; + uint8_t state : 7; + + bool operator==(context_model b) const { return state==b.state && MPSbit==b.MPSbit; } + bool operator!=(context_model b) const { return state!=b.state || MPSbit!=b.MPSbit; } +}; + + +enum context_model_index { + // SAO + CONTEXT_MODEL_SAO_MERGE_FLAG = 0, + CONTEXT_MODEL_SAO_TYPE_IDX = CONTEXT_MODEL_SAO_MERGE_FLAG +1, + + // CB-tree + CONTEXT_MODEL_SPLIT_CU_FLAG = CONTEXT_MODEL_SAO_TYPE_IDX + 1, + CONTEXT_MODEL_CU_SKIP_FLAG = CONTEXT_MODEL_SPLIT_CU_FLAG + 3, + + // intra-prediction + CONTEXT_MODEL_PART_MODE = CONTEXT_MODEL_CU_SKIP_FLAG + 3, + CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG = CONTEXT_MODEL_PART_MODE + 4, + CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE = CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG + 1, + + // transform-tree + CONTEXT_MODEL_CBF_LUMA = CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE + 1, + CONTEXT_MODEL_CBF_CHROMA = CONTEXT_MODEL_CBF_LUMA + 2, + CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG = CONTEXT_MODEL_CBF_CHROMA 
+ 4, + CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_FLAG = CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG + 3, + CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_IDX = CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_FLAG + 1, + + // residual + CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX = CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_IDX + 1, + CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX = CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX + 18, + CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG = CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX + 18, + CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG = CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG + 4, + CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG = CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG + 42+2, + CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG = CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG + 24, + + CONTEXT_MODEL_CU_QP_DELTA_ABS = CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG + 6, + CONTEXT_MODEL_TRANSFORM_SKIP_FLAG = CONTEXT_MODEL_CU_QP_DELTA_ABS + 2, + CONTEXT_MODEL_RDPCM_FLAG = CONTEXT_MODEL_TRANSFORM_SKIP_FLAG + 2, + CONTEXT_MODEL_RDPCM_DIR = CONTEXT_MODEL_RDPCM_FLAG + 2, + + // motion + CONTEXT_MODEL_MERGE_FLAG = CONTEXT_MODEL_RDPCM_DIR + 2, + CONTEXT_MODEL_MERGE_IDX = CONTEXT_MODEL_MERGE_FLAG + 1, + CONTEXT_MODEL_PRED_MODE_FLAG = CONTEXT_MODEL_MERGE_IDX + 1, + CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG = CONTEXT_MODEL_PRED_MODE_FLAG + 1, + CONTEXT_MODEL_MVP_LX_FLAG = CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG + 2, + CONTEXT_MODEL_RQT_ROOT_CBF = CONTEXT_MODEL_MVP_LX_FLAG + 1, + CONTEXT_MODEL_REF_IDX_LX = CONTEXT_MODEL_RQT_ROOT_CBF + 1, + CONTEXT_MODEL_INTER_PRED_IDC = CONTEXT_MODEL_REF_IDX_LX + 2, + CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG = CONTEXT_MODEL_INTER_PRED_IDC + 5, + CONTEXT_MODEL_LOG2_RES_SCALE_ABS_PLUS1 = CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG + 1, + CONTEXT_MODEL_RES_SCALE_SIGN_FLAG = CONTEXT_MODEL_LOG2_RES_SCALE_ABS_PLUS1 + 8, + CONTEXT_MODEL_TABLE_LENGTH = CONTEXT_MODEL_RES_SCALE_SIGN_FLAG + 2 +}; + + + +void initialize_CABAC_models(context_model 
context_model_table[CONTEXT_MODEL_TABLE_LENGTH], + int initType, + int QPY); + + +class context_model_table +{ + public: + context_model_table(); + context_model_table(const context_model_table&); + ~context_model_table(); + + void init(int initType, int QPY); + void release(); + void decouple(); + context_model_table transfer(); + context_model_table copy() const { context_model_table t=*this; t.decouple(); return t; } + + bool empty() const { return refcnt != NULL; } /* NOTE(review): yields true when refcnt is non-NULL, which looks inverted for a method named empty(); confirm against callers before changing */ + + context_model& operator[](int i) { return model[i]; } + + context_model_table& operator=(const context_model_table&); + + bool operator==(const context_model_table&) const; + + std::string debug_dump() const; + + private: + void decouple_or_alloc_with_empty_data(); + + context_model* model; // [CONTEXT_MODEL_TABLE_LENGTH] + int* refcnt; +}; + + +#endif diff --git a/libde265/de265-version.h b/libde265/de265-version.h new file mode 100644 index 0000000..0e22cbe --- /dev/null +++ b/libde265/de265-version.h @@ -0,0 +1,36 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +/* de265-version.h + * + * This file was generated by autoconf when libde265 was built. + * + * DO NOT EDIT THIS FILE.
+ */ +#ifndef LIBDE265_VERSION_H +#define LIBDE265_VERSION_H + +/* Numeric representation of the version */ +#define LIBDE265_NUMERIC_VERSION 0x01000500 + +/* Version string */ +#define LIBDE265_VERSION "1.0.5" + +#endif diff --git a/libde265/de265.h b/libde265/de265.h new file mode 100644 index 0000000..6481d8f --- /dev/null +++ b/libde265/de265.h @@ -0,0 +1,437 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+*/ + + +#ifndef DE265_H +#define DE265_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +//#define inline static __inline + + +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif +#include + +#if defined(_MSC_VER) && !defined(LIBDE265_STATIC_BUILD) + #ifdef LIBDE265_EXPORTS + #define LIBDE265_API __declspec(dllexport) + #else + #define LIBDE265_API __declspec(dllimport) + #endif +#elif HAVE_VISIBILITY + #ifdef LIBDE265_EXPORTS + #define LIBDE265_API __attribute__((__visibility__("default"))) + #else + #define LIBDE265_API + #endif +#else + #define LIBDE265_API +#endif + +#if __GNUC__ +#define LIBDE265_DEPRECATED __attribute__((deprecated)) +#elif defined(_MSC_VER) +#define LIBDE265_DEPRECATED __declspec(deprecated) +#else +#define LIBDE265_DEPRECATED +#endif + +#if defined(_MSC_VER) +#define LIBDE265_INLINE __inline +#else +#define LIBDE265_INLINE inline +#endif + +/* === version numbers === */ + +// version of linked libde265 library +LIBDE265_API const char *de265_get_version(void); +LIBDE265_API uint32_t de265_get_version_number(void); + +LIBDE265_API int de265_get_version_number_major(void); +LIBDE265_API int de265_get_version_number_minor(void); +LIBDE265_API int de265_get_version_number_maintenance(void); + + +/* === error codes === */ + +typedef enum { + DE265_OK = 0, + DE265_ERROR_NO_SUCH_FILE=1, + //DE265_ERROR_NO_STARTCODE=2, obsolet + //DE265_ERROR_EOF=3, + DE265_ERROR_COEFFICIENT_OUT_OF_IMAGE_BOUNDS=4, + DE265_ERROR_CHECKSUM_MISMATCH=5, + DE265_ERROR_CTB_OUTSIDE_IMAGE_AREA=6, + DE265_ERROR_OUT_OF_MEMORY=7, + DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE=8, + DE265_ERROR_IMAGE_BUFFER_FULL=9, + DE265_ERROR_CANNOT_START_THREADPOOL=10, + DE265_ERROR_LIBRARY_INITIALIZATION_FAILED=11, + DE265_ERROR_LIBRARY_NOT_INITIALIZED=12, + DE265_ERROR_WAITING_FOR_INPUT_DATA=13, + DE265_ERROR_CANNOT_PROCESS_SEI=14, + DE265_ERROR_PARAMETER_PARSING=15, + DE265_ERROR_NO_INITIAL_SLICE_HEADER=16, + DE265_ERROR_PREMATURE_END_OF_SLICE=17, + 
DE265_ERROR_UNSPECIFIED_DECODING_ERROR=18, + + // --- errors that should become obsolete in later libde265 versions --- + + //DE265_ERROR_MAX_THREAD_CONTEXTS_EXCEEDED = 500, obsolet + //DE265_ERROR_MAX_NUMBER_OF_SLICES_EXCEEDED = 501, obsolet + DE265_ERROR_NOT_IMPLEMENTED_YET = 502, + //DE265_ERROR_SCALING_LIST_NOT_IMPLEMENTED = 502, obsolet + + // --- warnings --- + + DE265_WARNING_NO_WPP_CANNOT_USE_MULTITHREADING = 1000, + DE265_WARNING_WARNING_BUFFER_FULL=1001, + DE265_WARNING_PREMATURE_END_OF_SLICE_SEGMENT=1002, + DE265_WARNING_INCORRECT_ENTRY_POINT_OFFSET=1003, + DE265_WARNING_CTB_OUTSIDE_IMAGE_AREA=1004, + DE265_WARNING_SPS_HEADER_INVALID=1005, + DE265_WARNING_PPS_HEADER_INVALID=1006, + DE265_WARNING_SLICEHEADER_INVALID=1007, + DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING=1008, + DE265_WARNING_NONEXISTING_PPS_REFERENCED=1009, + DE265_WARNING_NONEXISTING_SPS_REFERENCED=1010, + DE265_WARNING_BOTH_PREDFLAGS_ZERO=1011, + DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED=1012, + DE265_WARNING_NUMMVP_NOT_EQUAL_TO_NUMMVQ=1013, + DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE=1014, + DE265_WARNING_SHORT_TERM_REF_PIC_SET_OUT_OF_RANGE=1015, + DE265_WARNING_FAULTY_REFERENCE_PICTURE_LIST=1016, + DE265_WARNING_EOSS_BIT_NOT_SET=1017, + DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED=1018, + DE265_WARNING_INVALID_CHROMA_FORMAT=1019, + DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID=1020, + DE265_WARNING_DEPENDENT_SLICE_WITH_ADDRESS_ZERO=1021, + DE265_WARNING_NUMBER_OF_THREADS_LIMITED_TO_MAXIMUM=1022, + DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER=1023, + DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY=1024, + DE265_WARNING_SPS_MISSING_CANNOT_DECODE_SEI=1025, + DE265_WARNING_COLLOCATED_MOTION_VECTOR_OUTSIDE_IMAGE_AREA=1026 +} de265_error; + +LIBDE265_API const char* de265_get_error_text(de265_error err); + +/* Returns true, if 'err' is DE265_OK or a warning. 
+ */ +LIBDE265_API int de265_isOK(de265_error err); + +LIBDE265_API void de265_disable_logging(); // DEPRECATED +LIBDE265_API void de265_set_verbosity(int level); + + +/* === image === */ + +/* The image is currently always 3-channel YCbCr, with 4:2:0 chroma. + But you may want to check the chroma format anyway for future compatibility. + */ + +struct de265_image; + +enum de265_chroma { + de265_chroma_mono=0, + de265_chroma_420=1, + de265_chroma_422=2, + de265_chroma_444=3 +}; + +typedef int64_t de265_PTS; + + +LIBDE265_API int de265_get_image_width(const struct de265_image*,int channel); +LIBDE265_API int de265_get_image_height(const struct de265_image*,int channel); +LIBDE265_API enum de265_chroma de265_get_chroma_format(const struct de265_image*); +LIBDE265_API int de265_get_bits_per_pixel(const struct de265_image*,int channel); +/* The |out_stride| is returned as "bytes per line" if a non-NULL parameter is given. */ +LIBDE265_API const uint8_t* de265_get_image_plane(const struct de265_image*, int channel, int* out_stride); +LIBDE265_API void* de265_get_image_plane_user_data(const struct de265_image*, int channel); +LIBDE265_API de265_PTS de265_get_image_PTS(const struct de265_image*); +LIBDE265_API void* de265_get_image_user_data(const struct de265_image*); +LIBDE265_API void de265_set_image_user_data(struct de265_image*, void *user_data); + +/* Get NAL-header information of this frame. You can pass in NULL pointers if you + do not need this piece of information. + */ +LIBDE265_API void de265_get_image_NAL_header(const struct de265_image*, + int* nal_unit_type, + const char** nal_unit_name, // textual description of 'nal_unit_type' + int* nuh_layer_id, + int* nuh_temporal_id); + + +/* === decoder === */ + +typedef void de265_decoder_context; // private structure + + + +/* Get a new decoder context. Must be freed with de265_free_decoder(). */ +LIBDE265_API de265_decoder_context* de265_new_decoder(void); + +/* Initialize background decoding threads. 
If this function is not called, + all decoding is done in the main thread (no multi-threading). */ +LIBDE265_API de265_error de265_start_worker_threads(de265_decoder_context*, int number_of_threads); + +/* Free decoder context. May only be called once on a context. */ +LIBDE265_API de265_error de265_free_decoder(de265_decoder_context*); + +#ifndef LIBDE265_DISABLE_DEPRECATED +/* Push more data into the decoder, must be raw h265. + All complete images in the data will be decoded, hence, do not push + too much data at once to prevent image buffer overflows. + The end of a picture can only be detected when the succeeding start-code + is read from the data. + If you want to flush the data and force decoding of the data so far + (e.g. at the end of a file), call de265_decode_data() with 'length' zero. + + NOTE: This method is deprecated and will be removed in a future version. + You should use "de265_push_data" or "de265_push_NAL" and "de265_decode" + instead. +*/ +LIBDE265_API LIBDE265_DEPRECATED de265_error de265_decode_data(de265_decoder_context*, const void* data, int length); +#endif + +/* Push more data into the decoder, must be a raw h265 bytestream with startcodes. + The PTS is assigned to all NALs whose start-code 0x000001 is contained in the data. + The bytestream must contain all stuffing-bytes. + This function only pushes data into the decoder, nothing will be decoded. +*/ +LIBDE265_API de265_error de265_push_data(de265_decoder_context*, const void* data, int length, + de265_PTS pts, void* user_data); + +/* Indicate that de265_push_data has just received data until the end of a NAL. + The remaining pending input data is put into a NAL package and forwarded to the decoder. +*/ +LIBDE265_API void de265_push_end_of_NAL(de265_decoder_context*); + +/* Indicate that de265_push_data has just received data until the end of a frame. + All data pending at the decoder input will be pushed into the decoder and + the decoded picture is pushed to the output queue. 
+*/ +LIBDE265_API void de265_push_end_of_frame(de265_decoder_context*); + +/* Push a complete NAL unit without startcode into the decoder. The data must still + contain all stuffing-bytes. + This function only pushes data into the decoder, nothing will be decoded. +*/ +LIBDE265_API de265_error de265_push_NAL(de265_decoder_context*, const void* data, int length, + de265_PTS pts, void* user_data); + +/* Indicate the end-of-stream. All data pending at the decoder input will be + pushed into the decoder and the decoded picture queue will be completely emptied. + */ +LIBDE265_API de265_error de265_flush_data(de265_decoder_context*); + +/* Return number of bytes pending at the decoder input. + Can be used to avoid overflowing the decoder with too much data. + */ +LIBDE265_API int de265_get_number_of_input_bytes_pending(de265_decoder_context*); + +/* Return number of NAL units pending at the decoder input. + Can be used to avoid overflowing the decoder with too much data. + */ +LIBDE265_API int de265_get_number_of_NAL_units_pending(de265_decoder_context*); + +/* Do some decoding. Returns status whether it did perform some decoding or + why it could not do so. If 'more' is non-null, indicates whether de265_decode() + should be called again (possibly after resolving the indicated problem). + DE265_OK - decoding ok + DE265_ERROR_IMAGE_BUFFER_FULL - DPB full, extract some images before continuing + DE265_ERROR_WAITING_FOR_INPUT_DATA - insert more data before continuing + + You have to consider these cases: + - decoding successful -> err = DE265_OK, more=true + - decoding stalled -> err != DE265_OK, more=true + - decoding finished -> err = DE265_OK, more=false + - unresolvable error -> err != DE265_OK, more=false + */ +LIBDE265_API de265_error de265_decode(de265_decoder_context*, int* more); + +/* Clear decoder state. Call this when skipping in the stream. + */ +LIBDE265_API void de265_reset(de265_decoder_context*); + +/* Return next decoded picture, if there is any. 
If no complete picture has been + decoded yet, NULL is returned. You should call de265_release_next_picture() to + advance to the next picture. */ +LIBDE265_API const struct de265_image* de265_peek_next_picture(de265_decoder_context*); // may return NULL + +/* Get next decoded picture and remove this picture from the decoder output queue. + Returns NULL if there is no decoded picture ready. + You can use the picture only until you call any other de265_* function. */ +LIBDE265_API const struct de265_image* de265_get_next_picture(de265_decoder_context*); // may return NULL + +/* Release the current decoded picture for reuse in the decoder. You should not + use the data anymore after calling this function. */ +LIBDE265_API void de265_release_next_picture(de265_decoder_context*); + + +LIBDE265_API de265_error de265_get_warning(de265_decoder_context*); + + +enum de265_image_format { + de265_image_format_mono8 = 1, + de265_image_format_YUV420P8 = 2, + de265_image_format_YUV422P8 = 3, + de265_image_format_YUV444P8 = 4 +}; + +struct de265_image_spec +{ + enum de265_image_format format; + int width; + int height; + int alignment; + + // conformance window + + int crop_left; + int crop_right; + int crop_top; + int crop_bottom; + + int visible_width; // convenience, width - crop_left - crop_right + int visible_height; // convenience, height - crop_top - crop_bottom +}; + +struct de265_image_allocation +{ + int (*get_buffer)(de265_decoder_context* ctx, // first parameter deprecated + struct de265_image_spec* spec, + struct de265_image* img, + void* userdata); + void (*release_buffer)(de265_decoder_context* ctx, // first parameter deprecated + struct de265_image* img, + void* userdata); +}; + +/* The user data pointer will be given to the get_buffer() and release_buffer() functions + in de265_image_allocation.
*/ +LIBDE265_API void de265_set_image_allocation_functions(de265_decoder_context*, + struct de265_image_allocation*, + void* userdata); +LIBDE265_API const struct de265_image_allocation *de265_get_default_image_allocation_functions(void); + +LIBDE265_API void de265_set_image_plane(struct de265_image* img, int cIdx, void* mem, int stride, void *userdata); + + +/* --- frame dropping API --- + + To limit decoding to a maximum temporal layer (TID), use de265_set_limit_TID(). + The maximum layer ID in the stream can be queried with de265_get_highest_TID(). + Note that the maximum layer ID can change throughout the stream. + + For a fine-grained selection of the frame-rate, use de265_set_framerate_ratio(). + A percentage of 100% will decode all frames in all temporal layers. A lower percentage + will drop approximately as many frames. Note that this is only accurate if the frames + are distributed evenly among the layers. Otherwise, the mapping is non-linear. + + The limit_TID has a higher precedence than framerate_ratio. Hence, setting a higher + framerate-ratio will decode at limit_TID without dropping. + + With de265_change_framerate(), the output frame-rate can be increased/decreased to some + discrete preferable values. Currently, these are non-dropped decoding at various + TID layers.
+*/ + +LIBDE265_API int de265_get_highest_TID(de265_decoder_context*); // highest temporal substream to decode +LIBDE265_API int de265_get_current_TID(de265_decoder_context*); // currently decoded temporal substream + +LIBDE265_API void de265_set_limit_TID(de265_decoder_context*,int max_tid); // highest temporal substream to decode +LIBDE265_API void de265_set_framerate_ratio(de265_decoder_context*,int percent); // percentage of frames to decode (approx) +LIBDE265_API int de265_change_framerate(de265_decoder_context*,int more_vs_less); // 1: more, -1: less, returns corresponding framerate_ratio + + +/* --- decoding parameters --- */ + +enum de265_param { + DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH=0, // (bool) Perform SEI hash check on decoded pictures. + DE265_DECODER_PARAM_DUMP_SPS_HEADERS=1, // (int) Dump headers to specified file-descriptor. + DE265_DECODER_PARAM_DUMP_VPS_HEADERS=2, + DE265_DECODER_PARAM_DUMP_PPS_HEADERS=3, + DE265_DECODER_PARAM_DUMP_SLICE_HEADERS=4, + DE265_DECODER_PARAM_ACCELERATION_CODE=5, // (int) enum de265_acceleration, default: AUTO + DE265_DECODER_PARAM_SUPPRESS_FAULTY_PICTURES=6, // (bool) do not output frames with decoding errors, default: no (output all images) + + DE265_DECODER_PARAM_DISABLE_DEBLOCKING=7, // (bool) disable deblocking + DE265_DECODER_PARAM_DISABLE_SAO=8 // (bool) disable SAO filter + //DE265_DECODER_PARAM_DISABLE_MC_RESIDUAL_IDCT=9, // (bool) disable decoding of IDCT residuals in MC blocks + //DE265_DECODER_PARAM_DISABLE_INTRA_RESIDUAL_IDCT=10 // (bool) disable decoding of IDCT residuals in MC blocks +}; + +// sorted such that a large ID includes all optimizations from lower IDs +enum de265_acceleration { + de265_acceleration_SCALAR = 0, // only fallback implementation + de265_acceleration_MMX = 10, + de265_acceleration_SSE = 20, + de265_acceleration_SSE2 = 30, + de265_acceleration_SSE4 = 40, + de265_acceleration_AVX = 50, // not implemented yet + de265_acceleration_AVX2 = 60, // not implemented yet + 
de265_acceleration_ARM = 70, + de265_acceleration_NEON = 80, + de265_acceleration_AUTO = 10000 +}; + + +/* Set decoding parameters. */ +LIBDE265_API void de265_set_parameter_bool(de265_decoder_context*, enum de265_param param, int value); + +LIBDE265_API void de265_set_parameter_int(de265_decoder_context*, enum de265_param param, int value); + +/* Get decoding parameters. */ +LIBDE265_API int de265_get_parameter_bool(de265_decoder_context*, enum de265_param param); + + + +/* --- optional library initialization --- */ + +/* Static library initialization. Must be paired with de265_free(). + Initialization is optional, since it will be done implicitly in de265_new_decoder(). + Returns a de265_error code; a value other than DE265_OK means initialization failed. + Only call de265_free() when initialization was successful. + Multiple calls to 'init' are allowed, but must be matched with an equal number of 'free' calls. +*/ +LIBDE265_API de265_error de265_init(void); + +/* Free global library data. + An implicit free call is made in de265_free_decoder(). + Returns an error code (not DE265_OK) if the library was not initialized before, or if 'free' was called + more often than 'init'. + */ +LIBDE265_API de265_error de265_free(void); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libde265/deblock.h b/libde265/deblock.h new file mode 100644 index 0000000..b8f3781 --- /dev/null +++ b/libde265/deblock.h @@ -0,0 +1,29 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_DEBLOCK_H +#define DE265_DEBLOCK_H + +#include "libde265/decctx.h" + +void add_deblocking_tasks(image_unit* imgunit); +void apply_deblocking_filter(de265_image* img); //decoder_context* ctx); + +#endif diff --git a/libde265/decctx.h b/libde265/decctx.h new file mode 100644 index 0000000..c1acdce --- /dev/null +++ b/libde265/decctx.h @@ -0,0 +1,528 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef DE265_DECCTX_H +#define DE265_DECCTX_H + +#include "libde265/vps.h" +#include "libde265/sps.h" +#include "libde265/pps.h" +#include "libde265/nal.h" +#include "libde265/slice.h" +#include "libde265/image.h" +#include "libde265/motion.h" +#include "libde265/de265.h" +#include "libde265/dpb.h" +#include "libde265/sei.h" +#include "libde265/threads.h" +#include "libde265/acceleration.h" +#include "libde265/nal-parser.h" + +#include + +#define DE265_MAX_VPS_SETS 16 // this is the maximum as defined in the standard +#define DE265_MAX_SPS_SETS 16 // this is the maximum as defined in the standard +#define DE265_MAX_PPS_SETS 64 // this is the maximum as defined in the standard + +#define MAX_WARNINGS 20 + + +class slice_segment_header; +class image_unit; +class slice_unit; +class decoder_context; + + +class thread_context +{ +public: + thread_context(); + + int CtbAddrInRS; + int CtbAddrInTS; + + int CtbX, CtbY; + + + // motion vectors + + PBMotionCoding motion; + + + // prediction + + // enum IntraPredMode IntraPredModeC[4]; // chroma intra-prediction mode for current CB + int ResScaleVal; + + + // residual data + + uint8_t cu_transquant_bypass_flag; + uint8_t transform_skip_flag[3]; + uint8_t explicit_rdpcm_flag; + uint8_t explicit_rdpcm_dir; + + // we need 16 bytes of extra memory (8*int16) to shift the base for the + // alignment required for SSE code ! 
+ int16_t _coeffBuf[(32*32)+8]; + int16_t *coeffBuf; // the base pointer for into _coeffBuf, aligned to 16 bytes + + int16_t coeffList[3][32*32]; + int16_t coeffPos[3][32*32]; + int16_t nCoeff[3]; + + int32_t residual_luma[32*32]; // only used when cross-comp-prediction is enabled + + + // quantization + + int IsCuQpDeltaCoded; + int CuQpDelta; + int IsCuChromaQpOffsetCoded; + int CuQpOffsetCb, CuQpOffsetCr; + + int currentQPY; + int currentQG_x, currentQG_y; + int lastQPYinPreviousQG; + + int qPYPrime, qPCbPrime, qPCrPrime; + + CABAC_decoder cabac_decoder; + + context_model_table ctx_model; + uint8_t StatCoeff[4]; + + decoder_context* decctx; + struct de265_image *img; + slice_segment_header* shdr; + + image_unit* imgunit; + slice_unit* sliceunit; + thread_task* task; // executing thread_task or NULL if not multi-threaded + +private: + thread_context(const thread_context&); // not allowed + const thread_context& operator=(const thread_context&); // not allowed +}; + + + +class error_queue +{ + public: + error_queue(); + + void add_warning(de265_error warning, bool once); + de265_error get_warning(); + + private: + de265_error warnings[MAX_WARNINGS]; + int nWarnings; + de265_error warnings_shown[MAX_WARNINGS]; // warnings that have already occurred + int nWarningsShown; +}; + + + +class slice_unit +{ +public: + slice_unit(decoder_context* decctx); + ~slice_unit(); + + NAL_unit* nal; // we are the owner + slice_segment_header* shdr; // not the owner (de265_image is owner) + bitreader reader; + + image_unit* imgunit; + + bool flush_reorder_buffer; + + + // decoding status + + enum SliceDecodingProgress { Unprocessed, + InProgress, + Decoded + } state; + + de265_progress_lock finished_threads; + int nThreads; + + int first_decoded_CTB_RS; // TODO + int last_decoded_CTB_RS; // TODO + + void allocate_thread_contexts(int n); + thread_context* get_thread_context(int n) { + assert(n < nThreadContexts); + return &thread_contexts[n]; + } + int num_thread_contexts() const { 
return nThreadContexts; } + +private: + thread_context* thread_contexts; /* NOTE: cannot use std::vector, because thread_context has + no copy constructor. */ + int nThreadContexts; + +public: + decoder_context* ctx; + +private: + slice_unit(const slice_unit&); // not allowed + const slice_unit& operator=(const slice_unit&); // not allowed +}; + + +class image_unit +{ +public: + image_unit(); + ~image_unit(); + + de265_image* img; + de265_image sao_output; // if SAO is used, this is allocated and used as SAO output buffer + + std::vector slice_units; + std::vector suffix_SEIs; + + slice_unit* get_next_unprocessed_slice_segment() const { + for (int i=0;istate == slice_unit::Unprocessed) { + return slice_units[i]; + } + } + + return NULL; + } + + slice_unit* get_prev_slice_segment(slice_unit* s) const { + for (int i=1; istate != slice_unit::Unprocessed) return true; + return false; + } + + bool is_first_slice_segment(const slice_unit* s) const { + if (slice_units.size()==0) return false; + return (slice_units[0] == s); + } + + enum { Invalid, // headers not read yet + Unknown, // SPS/PPS available + Reference, // will be used as reference + Leaf // not a reference picture + } role; + + enum { Unprocessed, + InProgress, + Decoded, + Dropped // will not be decoded + } state; + + std::vector tasks; // we are the owner + + /* Saved context models for WPP. + There is one saved model for the initialization of each CTB row. + The array is unused for non-WPP streams. */ + std::vector ctx_models; // TODO: move this into image ? 
+}; + + +class base_context : public error_queue +{ + public: + base_context(); + virtual ~base_context() { } + + // --- accelerated DSP functions --- + + void set_acceleration_functions(enum de265_acceleration); + + struct acceleration_functions acceleration; // CPU optimized functions + + //virtual /* */ de265_image* get_image(int dpb_index) { return dpb.get_image(dpb_index); } + virtual const de265_image* get_image(int frame_id) const = 0; + virtual bool has_image(int frame_id) const = 0; +}; + + +class decoder_context : public base_context { + public: + decoder_context(); + ~decoder_context(); + + de265_error start_thread_pool(int nThreads); + void stop_thread_pool(); + + void reset(); + + bool has_sps(int id) const { return (bool)sps[id]; } + bool has_pps(int id) const { return (bool)pps[id]; } + + std::shared_ptr get_shared_sps(int id) { return sps[id]; } + std::shared_ptr get_shared_pps(int id) { return pps[id]; } + + /* */ seq_parameter_set* get_sps(int id) { return sps[id].get(); } + const seq_parameter_set* get_sps(int id) const { return sps[id].get(); } + /* */ pic_parameter_set* get_pps(int id) { return pps[id].get(); } + const pic_parameter_set* get_pps(int id) const { return pps[id].get(); } + + /* + const slice_segment_header* get_SliceHeader_atCtb(int ctb) { + return img->slices[img->get_SliceHeaderIndex_atIndex(ctb)]; + } + */ + + uint8_t get_nal_unit_type() const { return nal_unit_type; } + bool get_RapPicFlag() const { return RapPicFlag; } + + de265_error decode_NAL(NAL_unit* nal); + + de265_error decode(int* more); + de265_error decode_some(bool* did_work); + + de265_error decode_slice_unit_sequential(image_unit* imgunit, slice_unit* sliceunit); + de265_error decode_slice_unit_parallel(image_unit* imgunit, slice_unit* sliceunit); + de265_error decode_slice_unit_WPP(image_unit* imgunit, slice_unit* sliceunit); + de265_error decode_slice_unit_tiles(image_unit* imgunit, slice_unit* sliceunit); + + + void process_nal_hdr(nal_header*); + + bool 
process_slice_segment_header(slice_segment_header*, + de265_error*, de265_PTS pts, + nal_header* nal_hdr, void* user_data); + + //void push_current_picture_to_output_queue(); + de265_error push_picture_to_output_queue(image_unit*); + + + // --- parameters --- + + bool param_sei_check_hash; + bool param_conceal_stream_errors; + bool param_suppress_faulty_pictures; + + int param_sps_headers_fd; + int param_vps_headers_fd; + int param_pps_headers_fd; + int param_slice_headers_fd; + + bool param_disable_deblocking; + bool param_disable_sao; + //bool param_disable_mc_residual_idct; // not implemented yet + //bool param_disable_intra_residual_idct; // not implemented yet + + void set_image_allocation_functions(de265_image_allocation* allocfunc, void* userdata); + + de265_image_allocation param_image_allocation_functions; + void* param_image_allocation_userdata; + + + // --- input stream data --- + + NAL_Parser nal_parser; + + + int get_num_worker_threads() const { return num_worker_threads; } + + /* */ de265_image* get_image(int dpb_index) { return dpb.get_image(dpb_index); } + const de265_image* get_image(int dpb_index) const { return dpb.get_image(dpb_index); } + + bool has_image(int dpb_index) const { return dpb_index>=0 && dpb_index vps[ DE265_MAX_VPS_SETS ]; + std::shared_ptr sps[ DE265_MAX_SPS_SETS ]; + std::shared_ptr pps[ DE265_MAX_PPS_SETS ]; + + std::shared_ptr current_vps; + std::shared_ptr current_sps; + std::shared_ptr current_pps; + + public: + thread_pool thread_pool_; + + private: + int num_worker_threads; + + + public: + // --- frame dropping --- + + void set_limit_TID(int tid); + int get_highest_TID() const; + int get_current_TID() const { return current_HighestTid; } + int change_framerate(int more_vs_less); // 1: more, -1: less + void set_framerate_ratio(int percent); + + private: + // input parameters + int limit_HighestTid; // never switch to a layer above this one + int framerate_ratio; + + // current control parameters + int goal_HighestTid; // 
this is the layer we want to decode at + int layer_framerate_ratio; // ratio of frames to keep in the current layer + + int current_HighestTid; // the layer which we are currently decoding + + struct { + int8_t tid; + int8_t ratio; + } framedrop_tab[100+1]; + int framedrop_tid_index[6+1]; + + void compute_framedrop_table(); + void calc_tid_and_framerate_ratio(); + + private: + // --- decoded picture buffer --- + + decoded_picture_buffer dpb; + + int current_image_poc_lsb; + bool first_decoded_picture; + bool NoRaslOutputFlag; + bool HandleCraAsBlaFlag; + bool FirstAfterEndOfSequenceNAL; + + int PicOrderCntMsb; + int prevPicOrderCntLsb; // at precTid0Pic + int prevPicOrderCntMsb; // at precTid0Pic + + de265_image* img; + + public: + const slice_segment_header* previous_slice_header; /* Remember the last slice for a successive + dependent slice. */ + + + // --- motion compensation --- + + public: + int PocLsbLt[MAX_NUM_REF_PICS]; + int UsedByCurrPicLt[MAX_NUM_REF_PICS]; + int DeltaPocMsbCycleLt[MAX_NUM_REF_PICS]; + private: + int CurrDeltaPocMsbPresentFlag[MAX_NUM_REF_PICS]; + int FollDeltaPocMsbPresentFlag[MAX_NUM_REF_PICS]; + + // The number of entries in the lists below. + int NumPocStCurrBefore; + int NumPocStCurrAfter; + int NumPocStFoll; + int NumPocLtCurr; + int NumPocLtFoll; + + // These lists contain absolute POC values. + int PocStCurrBefore[MAX_NUM_REF_PICS]; // used for reference in current picture, smaller POC + int PocStCurrAfter[MAX_NUM_REF_PICS]; // used for reference in current picture, larger POC + int PocStFoll[MAX_NUM_REF_PICS]; // not used for reference in current picture, but in future picture + int PocLtCurr[MAX_NUM_REF_PICS]; // used in current picture + int PocLtFoll[MAX_NUM_REF_PICS]; // used in some future picture + + // These lists contain indices into the DPB. 
+ int RefPicSetStCurrBefore[MAX_NUM_REF_PICS]; + int RefPicSetStCurrAfter[MAX_NUM_REF_PICS]; + int RefPicSetStFoll[MAX_NUM_REF_PICS]; + int RefPicSetLtCurr[MAX_NUM_REF_PICS]; + int RefPicSetLtFoll[MAX_NUM_REF_PICS]; + + + // --- parameters derived from parameter sets --- + + // NAL + + uint8_t nal_unit_type; + + char IdrPicFlag; + char RapPicFlag; + + + // --- image unit queue --- + + std::vector image_units; + + bool flush_reorder_buffer_at_this_frame; + + private: + void init_thread_context(thread_context* tctx); + void add_task_decode_CTB_row(thread_context* tctx, bool firstSliceSubstream, int ctbRow); + void add_task_decode_slice_segment(thread_context* tctx, bool firstSliceSubstream, + int ctbX,int ctbY); + + void mark_whole_slice_as_processed(image_unit* imgunit, + slice_unit* sliceunit, + int progress); + + void process_picture_order_count(slice_segment_header* hdr); + int generate_unavailable_reference_picture(const seq_parameter_set* sps, + int POC, bool longTerm); + void process_reference_picture_set(slice_segment_header* hdr); + bool construct_reference_picture_lists(slice_segment_header* hdr); + + + void remove_images_from_dpb(const std::vector& removeImageList); + void run_postprocessing_filters_sequential(struct de265_image* img); + void run_postprocessing_filters_parallel(image_unit* img); +}; + + +#endif diff --git a/libde265/dpb.h b/libde265/dpb.h new file mode 100644 index 0000000..fa8ff59 --- /dev/null +++ b/libde265/dpb.h @@ -0,0 +1,118 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_DPB_H +#define DE265_DPB_H + +#include "libde265/image.h" +#include "libde265/sps.h" + +#include +#include + +class decoder_context; + +class decoded_picture_buffer { +public: + decoded_picture_buffer(); + ~decoded_picture_buffer(); + + void set_max_size_of_DPB(int n) { max_images_in_DPB=n; } + void set_norm_size_of_DPB(int n) { norm_images_in_DPB=n; } + + /* Alloc a new image in the DPB and return its index. + If there is no space for a new image, return -1. */ + int new_image(std::shared_ptr sps, decoder_context* decctx, + de265_PTS pts, void* user_data, bool isOutputImage); + + /* Check for a free slot in the DPB. There are some slots reserved for + unavailable reference frames. If high_priority==true, these reserved slots + are included in the check. */ + bool has_free_dpb_picture(bool high_priority) const; + + /* Remove all pictures from DPB and queues. Decoding should be stopped while calling this. */ + void clear(); + + int size() const { return dpb.size(); } + + /* Raw access to the images. */ + + /* */ de265_image* get_image(int index) { + if (index>=dpb.size()) return NULL; + return dpb[index]; + } + + const de265_image* get_image(int index) const { + if (index>=dpb.size()) return NULL; + return dpb[index]; + } + + /* Search DPB for the slot index of a specific picture. 
*/ + int DPB_index_of_picture_with_POC(int poc, int currentID, bool preferLongTerm=false) const; + int DPB_index_of_picture_with_LSB(int lsb, int currentID, bool preferLongTerm=false) const; + int DPB_index_of_picture_with_ID (int id) const; + + + // --- reorder buffer --- + + void insert_image_into_reorder_buffer(struct de265_image* img) { + reorder_output_queue.push_back(img); + } + + int num_pictures_in_reorder_buffer() const { return reorder_output_queue.size(); } + + // move next picture in reorder buffer to output queue + void output_next_picture_in_reorder_buffer(); + + // Move all pictures in reorder buffer to output buffer. Return true if there were any pictures. + bool flush_reorder_buffer(); + + + // --- output buffer --- + + int num_pictures_in_output_queue() const { return image_output_queue.size(); } + + /* Get the next picture in the output queue, but do not remove it from the queue. */ + struct de265_image* get_next_picture_in_output_queue() const { return image_output_queue.front(); } + + /* Remove the next picture in the output queue. */ + void pop_next_picture_in_output_queue(); + + + // --- debug --- + + void log_dpb_content() const; + void log_dpb_queues() const; + +private: + int max_images_in_DPB; + int norm_images_in_DPB; + + std::vector dpb; // decoded picture buffer + + std::vector reorder_output_queue; + std::deque image_output_queue; + +private: + decoded_picture_buffer(const decoded_picture_buffer&); // no copy + decoded_picture_buffer& operator=(const decoded_picture_buffer&); // no copy +}; + +#endif diff --git a/libde265/en265.h b/libde265/en265.h new file mode 100644 index 0000000..a22e5d1 --- /dev/null +++ b/libde265/en265.h @@ -0,0 +1,218 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * Authors: Dirk Farin + * + * This file is part of libde265. 
+ * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef EN265_H +#define EN265_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + + +// ========== encoder context ========== + +struct en265_encoder_context; // private structure + +/* Get a new encoder context. Must be freed with en265_free_encoder(). */ +LIBDE265_API en265_encoder_context* en265_new_encoder(void); + +/* Free encoder context. May only be called once on a context. */ +LIBDE265_API de265_error en265_free_encoder(en265_encoder_context*); + +/* The alloc_userdata pointer will be given to the release_func(). 
*/ +/* +LIBDE265_API void en265_set_image_release_function(en265_encoder_context*, + void (*release_func)(en265_encoder_context*, + struct de265_image*, + void* userdata), + void* alloc_userdata); +*/ + +// ========== encoder parameters ========== + +LIBDE265_API de265_error en265_set_parameter_bool(en265_encoder_context*, + const char* parametername,int value); +LIBDE265_API de265_error en265_set_parameter_int(en265_encoder_context*, + const char* parametername,int value); +LIBDE265_API de265_error en265_set_parameter_string(en265_encoder_context*, + const char* parametername,const char* value); +LIBDE265_API de265_error en265_set_parameter_choice(en265_encoder_context*, + const char* parametername,const char* value); + + +LIBDE265_API const char** en265_list_parameters(en265_encoder_context*); + +enum en265_parameter_type { + en265_parameter_bool, + en265_parameter_int, + en265_parameter_string, + en265_parameter_choice +}; + +LIBDE265_API enum en265_parameter_type en265_get_parameter_type(en265_encoder_context*, + const char* parametername); + +LIBDE265_API const char** en265_list_parameter_choices(en265_encoder_context*, + const char* parametername); + + +// --- convenience functions for command-line parameters --- + +LIBDE265_API de265_error en265_parse_command_line_parameters(en265_encoder_context*, + int* argc, char** argv); +LIBDE265_API void en265_show_parameters(en265_encoder_context*); + + + +// ========== encoding loop ========== + +LIBDE265_API de265_error en265_start_encoder(en265_encoder_context*, int number_of_threads); + +// If we have provided our own memory release function, no image memory will be allocated. 
+LIBDE265_API struct de265_image* en265_allocate_image(en265_encoder_context*, + int width, int height, + enum de265_chroma chroma, + de265_PTS pts, void* image_userdata); + +LIBDE265_API void* de265_alloc_image_plane(struct de265_image* img, int cIdx, + void* inputdata, int inputstride, void *userdata); +LIBDE265_API void de265_free_image_plane(struct de265_image* img, int cIdx); + + +// Request a specification of the image memory layout for an image of the specified dimensions. +LIBDE265_API void en265_get_image_spec(en265_encoder_context*, + int width, int height, enum de265_chroma chroma, + struct de265_image_spec* out_spec); + +// Image memory layout specification for an image returned by en265_allocate_image(). +/* TODO: do we need this? +LIBDE265_API void de265_get_image_spec_from_image(de265_image* img, struct de265_image_spec* spec); +*/ + + +LIBDE265_API de265_error en265_push_image(en265_encoder_context*, + struct de265_image*); // non-blocking + +LIBDE265_API de265_error en265_push_eof(en265_encoder_context*); + +// block when there are more than max_input_images in the input queue +LIBDE265_API de265_error en265_block_on_input_queue_length(en265_encoder_context*, + int max_pending_images, + int timeout_ms); + +LIBDE265_API de265_error en265_trim_input_queue(en265_encoder_context*, int max_pending_images); + +LIBDE265_API int en265_current_input_queue_length(en265_encoder_context*); + +// Run encoder in main thread. Only use this when not using background threads. 
+LIBDE265_API de265_error en265_encode(en265_encoder_context*); + +enum en265_encoder_state +{ + EN265_STATE_IDLE, + EN265_STATE_WAITING_FOR_INPUT, + EN265_STATE_WORKING, + EN265_STATE_OUTPUT_QUEUE_FULL, + EN265_STATE_EOS +}; + + +LIBDE265_API enum en265_encoder_state en265_get_encoder_state(en265_encoder_context*); + + +enum en265_packet_content_type { + EN265_PACKET_VPS, + EN265_PACKET_SPS, + EN265_PACKET_PPS, + EN265_PACKET_SEI, + EN265_PACKET_SLICE, + EN265_PACKET_SKIPPED_IMAGE +}; + + +enum en265_nal_unit_type { + EN265_NUT_TRAIL_N = 0, + EN265_NUT_TRAIL_R = 1, + EN265_NUT_TSA_N = 2, + EN265_NUT_TSA_R = 3, + EN265_NUT_STSA_N = 4, + EN265_NUT_STSA_R = 5, + EN265_NUT_RADL_N = 6, + EN265_NUT_RADL_R = 7, + EN265_NUT_RASL_N = 8, + EN265_NUT_RASL_R = 9, + EN265_NUT_BLA_W_LP = 16, + EN265_NUT_BLA_W_RADL= 17, + EN265_NUT_BLA_N_LP = 18, + EN265_NUT_IDR_W_RADL= 19, + EN265_NUT_IDR_N_LP = 20, + EN265_NUT_CRA = 21, + EN265_NUT_VPS = 32, + EN265_NUT_SPS = 33, + EN265_NUT_PPS = 34, + EN265_NUT_AUD = 35, + EN265_NUT_EOS = 36, + EN265_NUT_EOB = 37, + EN265_NUT_FD = 38, + EN265_NUT_PREFIX_SEI = 39, + EN265_NUT_SUFFIX_SEI = 40 +}; + + +struct en265_packet +{ + int version; // currently: 1 + + const uint8_t* data; + int length; + + int frame_number; + + enum en265_packet_content_type content_type; + char complete_picture : 1; + char final_slice : 1; + char dependent_slice : 1; + + enum en265_nal_unit_type nal_unit_type; + unsigned char nuh_layer_id; + unsigned char nuh_temporal_id; + + en265_encoder_context* encoder_context; + + const struct de265_image* input_image; + const struct de265_image* reconstruction; +}; + +// timeout_ms - timeout in milliseconds. 
0 - no timeout, -1 - block forever +LIBDE265_API struct en265_packet* en265_get_packet(en265_encoder_context*, int timeout_ms); +LIBDE265_API void en265_free_packet(en265_encoder_context*, struct en265_packet*); + +LIBDE265_API int en265_number_of_queued_packets(en265_encoder_context*); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/libde265/fallback-dct.h b/libde265/fallback-dct.h new file mode 100644 index 0000000..83d25c1 --- /dev/null +++ b/libde265/fallback-dct.h @@ -0,0 +1,96 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef FALLBACK_DCT_H +#define FALLBACK_DCT_H + +#include +#include + +#include "util.h" + + +// --- decoding --- + +void transform_skip_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); +void transform_bypass_fallback(int32_t *r, const int16_t *coeffs, int nT); + +void transform_skip_rdpcm_v_8_fallback(uint8_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride); +void transform_skip_rdpcm_h_8_fallback(uint8_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride); +void transform_bypass_rdpcm_v_fallback(int32_t *r, const int16_t *coeffs,int nT); +void transform_bypass_rdpcm_h_fallback(int32_t *r, const int16_t *coeffs,int nT); + +void transform_4x4_luma_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); +void transform_4x4_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); +void transform_8x8_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); +void transform_16x16_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); +void transform_32x32_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); + + +void transform_skip_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); +void transform_bypass_16_fallback(uint16_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth); + +void transform_4x4_luma_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); +void transform_4x4_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); +void transform_8x8_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); +void transform_16x16_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); +void transform_32x32_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); + +void rotate_coefficients_fallback(int16_t *coeff, int nT); + + +void 
transform_idst_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); +void transform_idct_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); +void transform_idct_8x8_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); +void transform_idct_16x16_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); +void transform_idct_32x32_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); + +template +void add_residual_fallback(pixel_t *dst, ptrdiff_t stride, + const int32_t* r, int nT, int bit_depth) +{ + for (int y=0;y + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef FALLBACK_MOTION_H +#define FALLBACK_MOTION_H + +#include +#include + + +void put_weighted_pred_avg_8_fallback(uint8_t *dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, + ptrdiff_t srcstride, int width, + int height); + +void put_unweighted_pred_8_fallback(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height); + +void put_weighted_pred_8_fallback(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, + int w,int o,int log2WD); +void put_weighted_bipred_8_fallback(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD); + +void put_weighted_pred_avg_16_fallback(uint16_t *dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, + ptrdiff_t srcstride, int width, + int height, int bit_depth); + +void put_unweighted_pred_16_fallback(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, int bit_depth); + +void put_weighted_pred_16_fallback(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, + int w,int o,int log2WD, int bit_depth); +void put_weighted_bipred_16_fallback(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD, int bit_depth); + + + +void put_epel_8_fallback(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t srcstride, + int width, int height, + int mx, int my, int16_t* mcbuffer); + +void put_epel_16_fallback(int16_t *out, ptrdiff_t out_stride, + const uint16_t *src, ptrdiff_t src_stride, + int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + +template +void put_epel_hv_fallback(int16_t *dst, ptrdiff_t dststride, + const pixel_t *_src, ptrdiff_t 
srcstride, + int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + + +#define QPEL(x,y) void put_qpel_ ## x ## _ ## y ## _fallback(int16_t *out, ptrdiff_t out_stride, \ + const uint8_t *src, ptrdiff_t srcstride, \ + int nPbW, int nPbH, int16_t* mcbuffer) +QPEL(0,0); QPEL(0,1); QPEL(0,2); QPEL(0,3); +QPEL(1,0); QPEL(1,1); QPEL(1,2); QPEL(1,3); +QPEL(2,0); QPEL(2,1); QPEL(2,2); QPEL(2,3); +QPEL(3,0); QPEL(3,1); QPEL(3,2); QPEL(3,3); + +#undef QPEL + + +#define QPEL(x,y) void put_qpel_ ## x ## _ ## y ## _fallback_16(int16_t *out, ptrdiff_t out_stride, \ + const uint16_t *src, ptrdiff_t srcstride, \ + int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) +QPEL(0,0); QPEL(0,1); QPEL(0,2); QPEL(0,3); +QPEL(1,0); QPEL(1,1); QPEL(1,2); QPEL(1,3); +QPEL(2,0); QPEL(2,1); QPEL(2,2); QPEL(2,3); +QPEL(3,0); QPEL(3,1); QPEL(3,2); QPEL(3,3); + +#undef QPEL + +#endif diff --git a/libde265/fallback.h b/libde265/fallback.h new file mode 100644 index 0000000..4b0b83c --- /dev/null +++ b/libde265/fallback.h @@ -0,0 +1,28 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef DE265_FALLBACK_H +#define DE265_FALLBACK_H + +#include "acceleration.h" + +void init_acceleration_functions_fallback(struct acceleration_functions* lowlevel); + +#endif diff --git a/libde265/image-io.h b/libde265/image-io.h new file mode 100644 index 0000000..1cc6c8d --- /dev/null +++ b/libde265/image-io.h @@ -0,0 +1,121 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * Authors: struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef IMAGE_IO_H +#define IMAGE_IO_H + +#include "libde265/image.h" +#include + + +class ImageSource +{ + public: + LIBDE265_API ImageSource(); + virtual LIBDE265_API ~ImageSource() { } + + //enum ImageStatus { Available, Waiting, EndOfVideo }; + + //virtual ImageStatus get_status() = 0; + virtual LIBDE265_API de265_image* get_image(bool block=true) = 0; + virtual LIBDE265_API void skip_frames(int n) = 0; + + virtual LIBDE265_API int get_width() const = 0; + virtual LIBDE265_API int get_height() const = 0; +}; + + + +class ImageSource_YUV : public ImageSource +{ + public: + LIBDE265_API ImageSource_YUV(); + virtual LIBDE265_API ~ImageSource_YUV(); + + bool LIBDE265_API set_input_file(const char* filename, int w,int h); + + //virtual ImageStatus get_status(); + virtual LIBDE265_API de265_image* get_image(bool block=true); + virtual LIBDE265_API void skip_frames(int n); + + virtual LIBDE265_API int get_width() const { return width; } + virtual LIBDE265_API int get_height() const { return height; } + + private: + FILE* mFH; + bool mReachedEndOfFile; + + int width,height; + + de265_image* read_next_image(); +}; + + + +class ImageSink +{ + public: + virtual LIBDE265_API ~ImageSink() { } + + virtual LIBDE265_API void send_image(const de265_image* img) = 0; +}; + +class ImageSink_YUV : public ImageSink +{ + public: + LIBDE265_API ImageSink_YUV() : mFH(NULL) { } + LIBDE265_API ~ImageSink_YUV(); + + bool LIBDE265_API set_filename(const char* filename); + + virtual LIBDE265_API void send_image(const de265_image* img); + + private: + FILE* mFH; +}; + + + +class PacketSink +{ + public: + virtual LIBDE265_API ~PacketSink() { } + + virtual LIBDE265_API void send_packet(const uint8_t* data, int n) = 0; +}; + + +class PacketSink_File : public PacketSink +{ + public: + LIBDE265_API PacketSink_File(); + virtual LIBDE265_API ~PacketSink_File(); + + LIBDE265_API void set_filename(const char* filename); + + virtual LIBDE265_API void send_packet(const uint8_t* data, int n); + 
+ private: + FILE* mFH; +}; + +#endif diff --git a/libde265/image.h b/libde265/image.h new file mode 100644 index 0000000..5611b72 --- /dev/null +++ b/libde265/image.h @@ -0,0 +1,864 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_IMAGE_H +#define DE265_IMAGE_H + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#ifdef HAVE_STDBOOL_H +#include +#endif + +#include "libde265/de265.h" +#include "libde265/sps.h" +#include "libde265/pps.h" +#include "libde265/motion.h" +#include "libde265/threads.h" +#include "libde265/slice.h" +#include "libde265/nal.h" + +struct en265_encoder_context; + +enum PictureState { + UnusedForReference, + UsedForShortTermReference, + UsedForLongTermReference +}; + + +/* TODO: + At INTEGRITY_DERIVED_FROM_FAULTY_REFERENCE images, we can check the SEI hash, whether + the output image is correct despite the faulty reference, and set the state back to correct. 
+*/ +#define INTEGRITY_CORRECT 0 +#define INTEGRITY_UNAVAILABLE_REFERENCE 1 +#define INTEGRITY_NOT_DECODED 2 +#define INTEGRITY_DECODING_ERRORS 3 +#define INTEGRITY_DERIVED_FROM_FAULTY_REFERENCE 4 + +#define SEI_HASH_UNCHECKED 0 +#define SEI_HASH_CORRECT 1 +#define SEI_HASH_INCORRECT 2 + +#define TU_FLAG_NONZERO_COEFF (1<<7) +#define TU_FLAG_SPLIT_TRANSFORM_MASK 0x1F + +#define DEBLOCK_FLAG_VERTI (1<<4) +#define DEBLOCK_FLAG_HORIZ (1<<5) +#define DEBLOCK_PB_EDGE_VERTI (1<<6) +#define DEBLOCK_PB_EDGE_HORIZ (1<<7) +#define DEBLOCK_BS_MASK 0x03 + + +#define CTB_PROGRESS_NONE 0 +#define CTB_PROGRESS_PREFILTER 1 +#define CTB_PROGRESS_DEBLK_V 2 +#define CTB_PROGRESS_DEBLK_H 3 +#define CTB_PROGRESS_SAO 4 + +class decoder_context; + +template class MetaDataArray +{ + public: + MetaDataArray() { data=NULL; data_size=0; log2unitSize=0; width_in_units=0; height_in_units=0; } + ~MetaDataArray() { free(data); } + + LIBDE265_CHECK_RESULT bool alloc(int w,int h, int _log2unitSize) { + int size = w*h; + + if (size != data_size) { + free(data); + data = (DataUnit*)malloc(size * sizeof(DataUnit)); + if (data == NULL) { + data_size = 0; + return false; + } + data_size = size; + } + + width_in_units = w; + height_in_units = h; + + log2unitSize = _log2unitSize; + + return data != NULL; + } + + void clear() { + if (data) memset(data, 0, sizeof(DataUnit) * data_size); + } + + const DataUnit& get(int x,int y) const { + int unitX = x>>log2unitSize; + int unitY = y>>log2unitSize; + + assert(unitX >= 0 && unitX < width_in_units); + assert(unitY >= 0 && unitY < height_in_units); + + return data[ unitX + unitY*width_in_units ]; + } + + DataUnit& get(int x,int y) { + int unitX = x>>log2unitSize; + int unitY = y>>log2unitSize; + + assert(unitX >= 0 && unitX < width_in_units); + assert(unitY >= 0 && unitY < height_in_units); + + return data[ unitX + unitY*width_in_units ]; + } + + void set(int x,int y, const DataUnit& d) { + int unitX = x>>log2unitSize; + int unitY = y>>log2unitSize; + + 
assert(unitX >= 0 && unitX < width_in_units); + assert(unitY >= 0 && unitY < height_in_units); + + data[ unitX + unitY*width_in_units ] = d; + } + + DataUnit& operator[](int idx) { return data[idx]; } + const DataUnit& operator[](int idx) const { return data[idx]; } + + int size() const { return data_size; } + + // private: + DataUnit* data; + int data_size; + int log2unitSize; + int width_in_units; + int height_in_units; +}; + +#define SET_CB_BLK(x,y,log2BlkWidth, Field,value) \ + int cbX = x >> cb_info.log2unitSize; \ + int cbY = y >> cb_info.log2unitSize; \ + int width = 1 << (log2BlkWidth - cb_info.log2unitSize); \ + for (int cby=cbY;cby> tu_info.log2unitSize; \ + int tuY = y >> tu_info.log2unitSize; \ + int width = 1 << (log2BlkWidth - tu_info.log2unitSize); \ + for (int tuy=tuY;tuy sps, + bool allocMetadata, + decoder_context* dctx, + //class encoder_context* ectx, + de265_PTS pts, void* user_data, + bool useCustomAllocFunctions); + + //de265_error alloc_encoder_data(const seq_parameter_set* sps); + + bool is_allocated() const { return pixels[0] != NULL; } + + void release(); + + void set_headers(std::shared_ptr _vps, + std::shared_ptr _sps, + std::shared_ptr _pps) { + vps = _vps; + sps = _sps; + pps = _pps; + } + + void fill_image(int y,int u,int v); + de265_error copy_image(const de265_image* src); + void copy_lines_from(const de265_image* src, int first, int end); + void exchange_pixel_data_with(de265_image&); + + uint32_t get_ID() const { return ID; } + + + /* */ uint8_t* get_image_plane(int cIdx) { return pixels[cIdx]; } + const uint8_t* get_image_plane(int cIdx) const { return pixels[cIdx]; } + + void set_image_plane(int cIdx, uint8_t* mem, int stride, void *userdata); + + uint8_t* get_image_plane_at_pos(int cIdx, int xpos,int ypos) + { + int stride = get_image_stride(cIdx); + return pixels[cIdx] + xpos + ypos*stride; + } + + + /// xpos;ypos in actual plane resolution + template + pixel_t* get_image_plane_at_pos_NEW(int cIdx, int xpos,int ypos) + { + 
int stride = get_image_stride(cIdx); + return (pixel_t*)(pixels[cIdx] + (xpos + ypos*stride)*sizeof(pixel_t)); + } + + const uint8_t* get_image_plane_at_pos(int cIdx, int xpos,int ypos) const + { + int stride = get_image_stride(cIdx); + return pixels[cIdx] + xpos + ypos*stride; + } + + void* get_image_plane_at_pos_any_depth(int cIdx, int xpos,int ypos) + { + int stride = get_image_stride(cIdx); + return pixels[cIdx] + ((xpos + ypos*stride) << bpp_shift[cIdx]); + } + + const void* get_image_plane_at_pos_any_depth(int cIdx, int xpos,int ypos) const + { + int stride = get_image_stride(cIdx); + return pixels[cIdx] + ((xpos + ypos*stride) << bpp_shift[cIdx]); + } + + /* Number of pixels in one row (not number of bytes). + */ + int get_image_stride(int cIdx) const + { + if (cIdx==0) return stride; + else return chroma_stride; + } + + int get_luma_stride() const { return stride; } + int get_chroma_stride() const { return chroma_stride; } + + int get_width (int cIdx=0) const { return cIdx==0 ? width : chroma_width; } + int get_height(int cIdx=0) const { return cIdx==0 ? 
height : chroma_height; } + + enum de265_chroma get_chroma_format() const { return chroma_format; } + + int get_bit_depth(int cIdx) const { + if (cIdx==0) return sps->BitDepth_Y; + else return sps->BitDepth_C; + } + + int get_bytes_per_pixel(int cIdx) const { + return (get_bit_depth(cIdx)+7)/8; + } + + bool high_bit_depth(int cIdx) const { + return get_bit_depth(cIdx)>8; + } + + bool can_be_released() const { return PicOutputFlag==false && PicState==UnusedForReference; } + + + void add_slice_segment_header(slice_segment_header* shdr) { + shdr->slice_index = slices.size(); + slices.push_back(shdr); + } + + + bool available_zscan(int xCurr,int yCurr, int xN,int yN) const; + + bool available_pred_blk(int xC,int yC, int nCbS, + int xP, int yP, int nPbW, int nPbH, int partIdx, + int xN,int yN) const; + + + static de265_image_allocation default_image_allocation; + + void printBlk(const char* title, int x0,int y0,int blkSize,int cIdx) const { + ::printBlk(title, get_image_plane_at_pos(cIdx,x0,y0), + blkSize, get_image_stride(cIdx)); + } + +private: + uint32_t ID; + static uint32_t s_next_image_ID; + + uint8_t* pixels[3]; + uint8_t bpp_shift[3]; // 0 for 8 bit, 1 for 16 bit + + enum de265_chroma chroma_format; + + int width, height; // size in luma pixels + + int chroma_width, chroma_height; + int stride, chroma_stride; + +public: + uint8_t BitDepth_Y, BitDepth_C; + uint8_t SubWidthC, SubHeightC; + std::vector slices; + +public: + + // --- conformance cropping window --- + + uint8_t* pixels_confwin[3]; // pointer to pixels in the conformance window + + int width_confwin, height_confwin; + int chroma_width_confwin, chroma_height_confwin; + + // --- decoding info --- + + // If PicOutputFlag==false && PicState==UnusedForReference, image buffer is free. 
+ + int picture_order_cnt_lsb; + int PicOrderCntVal; + enum PictureState PicState; + bool PicOutputFlag; + + int32_t removed_at_picture_id; + + const video_parameter_set& get_vps() const { return *vps; } + const seq_parameter_set& get_sps() const { return *sps; } + const pic_parameter_set& get_pps() const { return *pps; } + + bool has_vps() const { return (bool)vps; } + bool has_sps() const { return (bool)sps; } + bool has_pps() const { return (bool)pps; } + + std::shared_ptr get_shared_sps() { return sps; } + + //std::shared_ptr get_shared_sps() const { return sps; } + //std::shared_ptr get_shared_pps() const { return pps; } + + decoder_context* decctx; + //class encoder_context* encctx; + + int number_of_ctbs() const { return ctb_info.size(); } + +private: + // The image also keeps a reference to VPS/SPS/PPS, because when decoding is delayed, + // the currently active parameter sets in the decctx might already have been replaced + // with new parameters. + std::shared_ptr vps; + std::shared_ptr sps; // the SPS used for decoding this image + std::shared_ptr pps; // the PPS used for decoding this image + + MetaDataArray ctb_info; + MetaDataArray cb_info; + MetaDataArray pb_info; + MetaDataArray intraPredMode; + MetaDataArray intraPredModeC; + MetaDataArray tu_info; + MetaDataArray deblk_info; + +public: + // --- meta information --- + + de265_PTS pts; + void* user_data; + void* plane_user_data[3]; // this is logically attached to the pixel data pointers + de265_image_allocation image_allocation_functions; // the functions used for memory allocation + + /* + void (*encoder_image_release_func)(en265_encoder_context*, + de265_image*, + void* userdata); + */ + + uint8_t integrity; /* Whether an error occured while the image was decoded. + When generated, this is initialized to INTEGRITY_CORRECT, + and changed on decoding errors. 
+ */ + bool sei_hash_check_result; + + nal_header nal_hdr; + + // --- multi core --- + + de265_progress_lock* ctb_progress; // ctb_info_size + + void mark_all_CTB_progress(int progress) { + for (int i=0;i> tu_info.log2unitSize; + const int tuY = y >> tu_info.log2unitSize; + const int width = 1 << (log2TrafoSize - tu_info.log2unitSize); + + for (int tuy=tuY;tuy>sps->Log2MinPUSize) + (y0>>sps->Log2MinPUSize)*sps->PicWidthInMinPUs; + + for (int y=0;yPicWidthInMinPUs); + assert(y < sps->PicHeightInMinPUs); + + int idx = PUidx + x + y*intraPredMode.width_in_units; + assert(idx>sps->Log2MinPUSize) + (y0>>sps->Log2MinPUSize)*sps->PicWidthInMinPUs; + + for (int y=0;yPicWidthInMinPUs); + assert(yPicHeightInMinPUs); + + int idx = PUidx + x + y*intraPredModeC.width_in_units; + assert(idx= 0 && idx < slices.size(); + } + + slice_segment_header* get_SliceHeader(int x, int y) + { + int idx = get_SliceHeaderIndex(x,y); + if (idx >= slices.size()) { return NULL; } + return slices[idx]; + } + + slice_segment_header* get_SliceHeaderCtb(int ctbX, int ctbY) + { + int idx = get_SliceHeaderIndexCtb(ctbX,ctbY); + if (idx >= slices.size()) { return NULL; } + return slices[idx]; + } + + const slice_segment_header* get_SliceHeaderCtb(int ctbX, int ctbY) const + { + int idx = get_SliceHeaderIndexCtb(ctbX,ctbY); + if (idx >= slices.size()) { return NULL; } + return slices[idx]; + } + + void set_sao_info(int ctbX,int ctbY,const sao_info* saoinfo) + { + sao_info* sao = &ctb_info[ctbX + ctbY*ctb_info.width_in_units].saoInfo; + + memcpy(sao, + saoinfo, + sizeof(sao_info)); + } + + const sao_info* get_sao_info(int ctbX,int ctbY) const + { + return &ctb_info[ctbX + ctbY*ctb_info.width_in_units].saoInfo; + } + + + void set_CtbDeblockFlag(int ctbX, int ctbY, bool flag) + { + int idx = ctbX + ctbY*ctb_info.width_in_units; + ctb_info[idx].deblock = flag; + } + + bool get_CtbDeblockFlag(int ctbX, int ctbY) const + { + return ctb_info[ctbX + ctbY*ctb_info.width_in_units].deblock; + } + + + bool 
get_CTB_has_pcm_or_cu_transquant_bypass(int ctbX,int ctbY) const + { + int idx = ctbX + ctbY*ctb_info.width_in_units; + return ctb_info[idx].has_pcm_or_cu_transquant_bypass; + } + + + + // --- DEBLK metadata access --- + + int get_deblk_width() const { return deblk_info.width_in_units; } + int get_deblk_height() const { return deblk_info.height_in_units; } + + void set_deblk_flags(int x0,int y0, uint8_t flags) + { + const int xd = x0/4; + const int yd = y0/4; + + if (xd + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_INTRAPRED_H +#define DE265_INTRAPRED_H + +#include "libde265/decctx.h" + +extern const int intraPredAngle_table[1+34]; + + +/* Fill the three intra-pred-mode candidates into candModeList. + Block position is (x,y) and you also have to give the PUidx for this + block (which is (x>>Log2MinPUSize) + (y>>Log2MinPUSize)*PicWidthInMinPUs). + availableA/B is the output of check_CTB_available(). 
+ */ +void fillIntraPredModeCandidates(enum IntraPredMode candModeList[3], + int x,int y, int PUidx, + bool availableA, // left + bool availableB, // top + const de265_image* img); + + +inline void fillIntraPredModeCandidates(enum IntraPredMode candModeList[3], int x,int y, + bool availableA, // left + bool availableB, // top + const de265_image* img) +{ + int PUidx = img->get_sps().getPUIndexRS(x,y); + fillIntraPredModeCandidates(candModeList, x,y, PUidx, availableA,availableB, img); +} + +void fillIntraPredModeCandidates(enum IntraPredMode candModeList[3], + enum IntraPredMode candIntraPredModeA, + enum IntraPredMode candIntraPredModeB); + + +/* Return value >= 0 -> use mpm_idx(return value) + else -> use rem_intra(-return value-1) + + This function may modify the candModeList ! + */ +int find_intra_pred_mode(enum IntraPredMode mode, + enum IntraPredMode candModeList[3]); + +void list_chroma_pred_candidates(enum IntraPredMode chroma_mode[5], + enum IntraPredMode luma_mode); + +int get_intra_scan_idx(int log2TrafoSize, enum IntraPredMode intraPredMode, int cIdx, + const seq_parameter_set* sps); + +int get_intra_scan_idx_luma (int log2TrafoSize, enum IntraPredMode intraPredMode); // DEPRECATED +int get_intra_scan_idx_chroma(int log2TrafoSize, enum IntraPredMode intraPredMode); // DEPRECATED + +enum IntraPredMode lumaPredMode_to_chromaPredMode(enum IntraPredMode luma, + enum IntraChromaPredMode chroma); + +/* +void decode_intra_block(decoder_context* ctx, + thread_context* tctx, + int cIdx, + int xB0,int yB0, // position of TU in frame (chroma adapted) + int x0,int y0, // position of CU in frame (chroma adapted) + int log2TrafoSize, int trafoDepth, + enum IntraPredMode intraPredMode, + bool transform_skip_flag); +*/ + +//void fill_border_samples(decoder_context* ctx, int xB,int yB, +// int nT, int cIdx, uint8_t* out_border); + +void decode_intra_prediction(de265_image* img, + int xB0,int yB0, + enum IntraPredMode intraPredMode, + int nT, int cIdx); + +// TODO: 
remove this +template void decode_intra_prediction(de265_image* img, + int xB0,int yB0, + enum IntraPredMode intraPredMode, + pixel_t* dst, int nT, int cIdx); + + + + +// --- internal use only --- + +// Actually, the largest TB block can only be 32, but in some intra-pred-mode algorithms +// (e.g. min-residual), we may call intra prediction on the maximum CTB size (64). +static const int MAX_INTRA_PRED_BLOCK_SIZE = 64; + + +template +class intra_border_computer +{ + public: + pixel_t* out_border; + + const de265_image* img; + int nT; + int cIdx; + + int xB,yB; + + const seq_parameter_set* sps; + const pic_parameter_set* pps; + + uint8_t available_data[4*MAX_INTRA_PRED_BLOCK_SIZE + 1]; + uint8_t* available; + + int SubWidth; + int SubHeight; + + bool availableLeft; // is CTB at left side available? + bool availableTop; // is CTB at top side available? + bool availableTopRight; // is CTB at top-right side available? + bool availableTopLeft; // if CTB at top-left pixel available? + + int nBottom; + int nRight; + int nAvail; + pixel_t firstValue; + + void init(pixel_t* _out_border, + const de265_image* _img, int _nT, int _cIdx, int _xB, int _yB) { + img=_img; nT=_nT; cIdx=_cIdx; + out_border=_out_border; xB=_xB; yB=_yB; + + assert(nT <= MAX_INTRA_PRED_BLOCK_SIZE); + + availableLeft=true; + availableTop=true; + availableTopRight=true; + availableTopLeft=true; + } + void preproc(); + void fill_from_image(); + + void reference_sample_substitution(); +}; + + +#ifdef DE265_LOG_TRACE +template +void print_border(pixel_t* data, uint8_t* available, int nT) +{ + for (int i=-2*nT ; i<=2*nT ; i++) { + if (i==0 || i==1 || i==-nT || i==nT+1) { + logtrace(LogIntraPred,"|"); + } else { + logtrace(LogIntraPred," "); + } + + if (available==NULL || available[i]) { + logtrace(LogIntraPred,"%02x",data[i]); + } + else { + logtrace(LogIntraPred,"--"); + } + } +} +#else +#define print_border(data, available, nT) +#endif + + +// (8.4.4.2.3) +template +void 
intra_prediction_sample_filtering(const seq_parameter_set& sps, + pixel_t* p, + int nT, int cIdx, + enum IntraPredMode intraPredMode) +{ + int filterFlag; + + //printf("filtering, mode: %d\n",intraPredMode); + + if (intraPredMode==INTRA_DC || nT==4) { + filterFlag = 0; + } else { + // int-cast below prevents a typing problem that leads to wrong results when abs_value is a macro + int minDistVerHor = libde265_min( abs_value((int)intraPredMode-26), + abs_value((int)intraPredMode-10) ); + + //printf("mindist: %d\n",minDistVerHor); + + switch (nT) { + case 8: filterFlag = (minDistVerHor>7) ? 1 : 0; break; + case 16: filterFlag = (minDistVerHor>1) ? 1 : 0; break; + case 32: filterFlag = (minDistVerHor>0) ? 1 : 0; break; + // there is no official 64x64 TB block, but we call this for some intra-pred mode algorithms + // on the whole CB (2Nx2N mode for the whole CTB) + case 64: filterFlag = 0; break; + default: filterFlag = -1; assert(false); break; // should never happen + } + } + + + if (filterFlag) { + int biIntFlag = (sps.strong_intra_smoothing_enable_flag && + cIdx==0 && + nT==32 && + abs_value(p[0]+p[ 64]-2*p[ 32]) < (1<<(sps.bit_depth_luma-5)) && + abs_value(p[0]+p[-64]-2*p[-32]) < (1<<(sps.bit_depth_luma-5))) + ? 1 : 0; + + pixel_t pF_mem[4*32+1]; + pixel_t* pF = &pF_mem[2*32]; + + if (biIntFlag) { + pF[-2*nT] = p[-2*nT]; + pF[ 2*nT] = p[ 2*nT]; + pF[ 0] = p[ 0]; + + for (int i=1;i<=63;i++) { + pF[-i] = p[0] + ((i*(p[-64]-p[0])+32)>>6); + pF[ i] = p[0] + ((i*(p[ 64]-p[0])+32)>>6); + } + } else { + pF[-2*nT] = p[-2*nT]; + pF[ 2*nT] = p[ 2*nT]; + + for (int i=-(2*nT-1) ; i<=2*nT-1 ; i++) + { + pF[i] = (p[i+1] + 2*p[i] + p[i-1] + 2) >> 2; + } + } + + + // copy back to original array + + memcpy(p-2*nT, pF-2*nT, (4*nT+1) * sizeof(pixel_t)); + } + else { + // do nothing ? 
+ } + + + logtrace(LogIntraPred,"post filtering: "); + print_border(p,NULL,nT); + logtrace(LogIntraPred,"\n"); +} + + +template +void intra_prediction_planar(pixel_t* dst, int dstStride, + int nT,int cIdx, + pixel_t* border) +{ + int Log2_nT = Log2(nT); + + for (int y=0;y> (Log2_nT+1); + } + + + logtrace(LogIntraPred,"result of planar prediction\n"); + + for (int y=0;y +void intra_prediction_DC(pixel_t* dst, int dstStride, + int nT,int cIdx, + pixel_t* border) +{ + int Log2_nT = Log2(nT); + + int dcVal = 0; + for (int i=0;i>= Log2_nT+1; + + if (cIdx==0 && nT<32) { + dst[0] = (border[-1] + 2*dcVal + border[1] +2) >> 2; + + for (int x=1;x>2; } + for (int y=1;y>2; } + for (int y=1;y +void intra_prediction_angular(pixel_t* dst, int dstStride, + int bit_depth, bool disableIntraBoundaryFilter, + int xB0,int yB0, + enum IntraPredMode intraPredMode, + int nT,int cIdx, + pixel_t* border) +{ + pixel_t ref_mem[4*MAX_INTRA_PRED_BLOCK_SIZE+1]; // TODO: what is the required range here ? + pixel_t* ref=&ref_mem[2*MAX_INTRA_PRED_BLOCK_SIZE]; + + assert(intraPredMode<35); + assert(intraPredMode>=2); + + int intraPredAngle = intraPredAngle_table[intraPredMode]; + + if (intraPredMode >= 18) { + + for (int x=0;x<=nT;x++) + { ref[x] = border[x]; } + + if (intraPredAngle<0) { + int invAngle = invAngle_table[intraPredMode-11]; + + if ((nT*intraPredAngle)>>5 < -1) { + for (int x=(nT*intraPredAngle)>>5; x<=-1; x++) { + ref[x] = border[0-((x*invAngle+128)>>8)]; + } + } + } else { + for (int x=nT+1; x<=2*nT;x++) { + ref[x] = border[x]; + } + } + + for (int y=0;y>5; + int iFact= ((y+1)*intraPredAngle)&31; + + if (iFact != 0) { + dst[x+y*dstStride] = ((32-iFact)*ref[x+iIdx+1] + iFact*ref[x+iIdx+2] + 16)>>5; + } else { + dst[x+y*dstStride] = ref[x+iIdx+1]; + } + } + + if (intraPredMode==26 && cIdx==0 && nT<32 && !disableIntraBoundaryFilter) { + for (int y=0;y>1), bit_depth); + } + } + } + else { // intraPredAngle < 18 + + for (int x=0;x<=nT;x++) + { ref[x] = border[-x]; } // DIFF (neg) + + if 
(intraPredAngle<0) { + int invAngle = invAngle_table[intraPredMode-11]; + + if ((nT*intraPredAngle)>>5 < -1) { + for (int x=(nT*intraPredAngle)>>5; x<=-1; x++) { + ref[x] = border[((x*invAngle+128)>>8)]; // DIFF (neg) + } + } + } else { + for (int x=nT+1; x<=2*nT;x++) { + ref[x] = border[-x]; // DIFF (neg) + } + } + + for (int y=0;y>5; // DIFF (x<->y) + int iFact= ((x+1)*intraPredAngle)&31; // DIFF (x<->y) + + if (iFact != 0) { + dst[x+y*dstStride] = ((32-iFact)*ref[y+iIdx+1] + iFact*ref[y+iIdx+2] + 16)>>5; // DIFF (x<->y) + } else { + dst[x+y*dstStride] = ref[y+iIdx+1]; // DIFF (x<->y) + } + } + + if (intraPredMode==10 && cIdx==0 && nT<32 && !disableIntraBoundaryFilter) { // DIFF 26->10 + for (int x=0;xy) + dst[x] = Clip_BitDepth(border[-1] + ((border[1+x] - border[0])>>1), bit_depth); // DIFF (x<->y && neg) + } + } + } + + + logtrace(LogIntraPred,"result of angular intra prediction (mode=%d):\n",intraPredMode); + + for (int y=0;y +void intra_border_computer::preproc() +{ + sps = &img->get_sps(); + pps = &img->get_pps(); + + SubWidth = (cIdx==0) ? 1 : sps->SubWidthC; + SubHeight = (cIdx==0) ? 
1 : sps->SubHeightC; + + // --- check for CTB boundaries --- + + int xBLuma = xB * SubWidth; + int yBLuma = yB * SubHeight; + + int log2CtbSize = sps->Log2CtbSizeY; + int picWidthInCtbs = sps->PicWidthInCtbsY; + + + //printf("xB/yB: %d %d\n",xB,yB); + + // are we at left image border + + if (xBLuma == 0) { + availableLeft = false; + availableTopLeft = false; + xBLuma = 0; // fake value, available flags are already set to false + } + + + // are we at top image border + + if (yBLuma == 0) { + availableTop = false; + availableTopLeft = false; + availableTopRight = false; + yBLuma = 0; // fake value, available flags are already set to false + } + + if (xBLuma+nT*SubWidth >= sps->pic_width_in_luma_samples) { + availableTopRight=false; + } + + // check for tile and slice boundaries + + int xCurrCtb = xBLuma >> log2CtbSize; + int yCurrCtb = yBLuma >> log2CtbSize; + int xLeftCtb = (xBLuma-1) >> log2CtbSize; + int xRightCtb = (xBLuma+nT*SubWidth) >> log2CtbSize; + int yTopCtb = (yBLuma-1) >> log2CtbSize; + + int currCTBSlice = img->get_SliceAddrRS(xCurrCtb,yCurrCtb); + int leftCTBSlice = availableLeft ? img->get_SliceAddrRS(xLeftCtb, yCurrCtb) : -1; + int topCTBSlice = availableTop ? img->get_SliceAddrRS(xCurrCtb, yTopCtb) : -1; + int toprightCTBSlice = availableTopRight ? img->get_SliceAddrRS(xRightCtb, yTopCtb) : -1; + int topleftCTBSlice = availableTopLeft ? img->get_SliceAddrRS(xLeftCtb, yTopCtb) : -1; + + /* + printf("size: %d\n",pps->TileIdRS.size()); + printf("curr: %d left: %d top: %d\n", + xCurrCtb+yCurrCtb*picWidthInCtbs, + availableLeft ? xLeftCtb+yCurrCtb*picWidthInCtbs : 9999, + availableTop ? xCurrCtb+yTopCtb*picWidthInCtbs : 9999); + */ + int currCTBTileID = pps->TileIdRS[xCurrCtb+yCurrCtb*picWidthInCtbs]; + int leftCTBTileID = availableLeft ? pps->TileIdRS[xLeftCtb+yCurrCtb*picWidthInCtbs] : -1; + int topCTBTileID = availableTop ? pps->TileIdRS[xCurrCtb+yTopCtb*picWidthInCtbs] : -1; + int topleftCTBTileID = availableTopLeft ? 
pps->TileIdRS[xLeftCtb+yTopCtb*picWidthInCtbs] : -1; + int toprightCTBTileID= availableTopRight? pps->TileIdRS[xRightCtb+yTopCtb*picWidthInCtbs] : -1; + + if (leftCTBSlice != currCTBSlice || leftCTBTileID != currCTBTileID ) availableLeft = false; + if (topCTBSlice != currCTBSlice || topCTBTileID != currCTBTileID ) availableTop = false; + if (topleftCTBSlice !=currCTBSlice||topleftCTBTileID!=currCTBTileID ) availableTopLeft = false; + if (toprightCTBSlice!=currCTBSlice||toprightCTBTileID!=currCTBTileID) availableTopRight= false; + + + // number of pixels that are in the valid image area to the right and to the bottom + + nBottom = sps->pic_height_in_luma_samples - yB*SubHeight; + nBottom=(nBottom+SubHeight-1)/SubHeight; + if (nBottom>2*nT) nBottom=2*nT; + + nRight = sps->pic_width_in_luma_samples - xB*SubWidth; + nRight =(nRight +SubWidth-1)/SubWidth; + if (nRight >2*nT) nRight=2*nT; + + nAvail=0; + + available = &available_data[2*MAX_INTRA_PRED_BLOCK_SIZE]; + + memset(available-2*nT, 0, 4*nT+1); +} + + +template +void intra_border_computer::fill_from_image() +{ + assert(nT<=32); + + pixel_t* image; + int stride; + image = (pixel_t*)img->get_image_plane(cIdx); + stride = img->get_image_stride(cIdx); + + int xBLuma = xB * SubWidth; + int yBLuma = yB * SubHeight; + + int currBlockAddr = pps->MinTbAddrZS[ (xBLuma>>sps->Log2MinTrafoSize) + + (yBLuma>>sps->Log2MinTrafoSize) * sps->PicWidthInTbsY ]; + + + // copy pixels at left column + + for (int y=nBottom-1 ; y>=0 ; y-=4) + if (availableLeft) + { + int NBlockAddr = pps->MinTbAddrZS[ (((xB-1)*SubWidth )>>sps->Log2MinTrafoSize) + + (((yB+y)*SubHeight)>>sps->Log2MinTrafoSize) + * sps->PicWidthInTbsY ]; + + bool availableN = NBlockAddr <= currBlockAddr; + + if (pps->constrained_intra_pred_flag) { + if (img->get_pred_mode((xB-1)*SubWidth,(yB+y)*SubHeight)!=MODE_INTRA) + availableN = false; + } + + if (availableN) { + if (!nAvail) firstValue = image[xB-1 + (yB+y)*stride]; + + for (int i=0;i<4;i++) { + available[-y+i-1] = 
availableN; + out_border[-y+i-1] = image[xB-1 + (yB+y-i)*stride]; + } + + nAvail+=4; + } + } + + // copy pixel at top-left position + + if (availableTopLeft) + { + int NBlockAddr = pps->MinTbAddrZS[ (((xB-1)*SubWidth )>>sps->Log2MinTrafoSize) + + (((yB-1)*SubHeight)>>sps->Log2MinTrafoSize) + * sps->PicWidthInTbsY ]; + + bool availableN = NBlockAddr <= currBlockAddr; + + if (pps->constrained_intra_pred_flag) { + if (img->get_pred_mode((xB-1)*SubWidth,(yB-1)*SubHeight)!=MODE_INTRA) { + availableN = false; + } + } + + if (availableN) { + if (!nAvail) firstValue = image[xB-1 + (yB-1)*stride]; + + out_border[0] = image[xB-1 + (yB-1)*stride]; + available[0] = availableN; + nAvail++; + } + } + + // copy pixels at top row + + for (int x=0 ; xMinTbAddrZS[ (((xB+x)*SubWidth )>>sps->Log2MinTrafoSize) + + (((yB-1)*SubHeight)>>sps->Log2MinTrafoSize) + * sps->PicWidthInTbsY ]; + + bool availableN = NBlockAddr <= currBlockAddr; + + if (pps->constrained_intra_pred_flag) { + if (img->get_pred_mode((xB+x)*SubWidth,(yB-1)*SubHeight)!=MODE_INTRA) { + availableN = false; + } + } + + + if (availableN) { + if (!nAvail) firstValue = image[xB+x + (yB-1)*stride]; + + for (int i=0;i<4;i++) { + out_border[x+i+1] = image[xB+x+i + (yB-1)*stride]; + available[x+i+1] = availableN; + } + + nAvail+=4; + } + } + } +} + + + +template +void intra_border_computer::reference_sample_substitution() +{ + // reference sample substitution + + const int bit_depth = img->get_bit_depth(cIdx); + + if (nAvail!=4*nT+1) { + if (nAvail==0) { + if (sizeof(pixel_t)==1) { + memset(out_border-2*nT, 1<<(bit_depth-1), 4*nT+1); + } + else { + for (int i = -2*nT; i <= 2*nT ; i++) { + out_border[i] = 1<<(bit_depth-1); + } + } + } + else { + if (!available[-2*nT]) { + out_border[-2*nT] = firstValue; + } + + for (int i=-2*nT+1; i<=2*nT; i++) + if (!available[i]) { + out_border[i]=out_border[i-1]; + } + } + } + + logtrace(LogIntraPred,"availableN: "); + print_border(available,NULL,nT); + logtrace(LogIntraPred,"\n"); + + 
logtrace(LogIntraPred,"output: "); + print_border(out_border,NULL,nT); + logtrace(LogIntraPred,"\n"); +} + + +#endif diff --git a/libde265/md5.h b/libde265/md5.h new file mode 100644 index 0000000..f1a6857 --- /dev/null +++ b/libde265/md5.h @@ -0,0 +1,45 @@ +/* + * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc. + * MD5 Message-Digest Algorithm (RFC 1321). + * + * Homepage: + * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 + * + * Author: + * Alexander Peslyak, better known as Solar Designer + * + * This software was written by Alexander Peslyak in 2001. No copyright is + * claimed, and the software is hereby placed in the public domain. + * In case this attempt to disclaim copyright and place the software in the + * public domain is deemed null and void, then the software is + * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the + * general public under the following terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * There's ABSOLUTELY NO WARRANTY, express or implied. + * + * See md5.c for more information. + */ + +#ifdef HAVE_OPENSSL +#include +#elif !defined(_MD5_H) +#define _MD5_H + +/* Any 32-bit or wider unsigned integer data type will do */ +typedef unsigned int MD5_u32plus; + +typedef struct { + MD5_u32plus lo, hi; + MD5_u32plus a, b, c, d; + unsigned char buffer[64]; + MD5_u32plus block[16]; +} MD5_CTX; + +extern void MD5_Init(MD5_CTX *ctx); +extern void MD5_Update(MD5_CTX *ctx, void *data, unsigned long size); +extern void MD5_Final(unsigned char *result, MD5_CTX *ctx); + +#endif diff --git a/libde265/motion.h b/libde265/motion.h new file mode 100644 index 0000000..12d7791 --- /dev/null +++ b/libde265/motion.h @@ -0,0 +1,131 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. 
+ * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_MOTION_H +#define DE265_MOTION_H + +#include +#include "slice.h" + +class base_context; +class slice_segment_header; + +class MotionVector +{ + public: + int16_t x,y; +}; + + +class PBMotion +{ + public: + uint8_t predFlag[2]; // which of the two vectors is actually used + int8_t refIdx[2]; // index into RefPicList + MotionVector mv[2]; // the absolute motion vectors + + bool operator==(const PBMotion&) const; +}; + + +class PBMotionCoding +{ + public: + // index into RefPicList + int8_t refIdx[2]; + + // motion vector difference + int16_t mvd[2][2]; // [L0/L1][x/y] (only in top left position - ???) 
+ + // enum InterPredIdc, whether this is prediction from L0,L1, or BI + uint8_t inter_pred_idc : 2; + + // which of the two MVPs is used + uint8_t mvp_l0_flag : 1; + uint8_t mvp_l1_flag : 1; + + // whether merge mode is used + uint8_t merge_flag : 1; + uint8_t merge_idx : 3; +}; + + +void get_merge_candidate_list(base_context* ctx, + const slice_segment_header* shdr, + struct de265_image* img, + int xC,int yC, int xP,int yP, + int nCS, int nPbW,int nPbH, int partIdx, + PBMotion* mergeCandList); + +/* +int derive_spatial_merging_candidates(const struct de265_image* img, + int xC, int yC, int nCS, int xP, int yP, + uint8_t singleMCLFlag, + int nPbW, int nPbH, + int partIdx, + MotionVectorSpec* out_cand, + int maxCandidates); +*/ + +void generate_inter_prediction_samples(base_context* ctx, + const slice_segment_header* shdr, + struct de265_image* img, + int xC,int yC, + int xB,int yB, + int nCS, int nPbW,int nPbH, + const PBMotion* vi); + + +/* Fill list (two entries) of motion-vector predictors for MVD coding. 
+ */ +void fill_luma_motion_vector_predictors(base_context* ctx, + const slice_segment_header* shdr, + de265_image* img, + int xC,int yC,int nCS,int xP,int yP, + int nPbW,int nPbH, int l, + int refIdx, int partIdx, + MotionVector out_mvpList[2]); + + +void decode_prediction_unit(base_context* ctx,const slice_segment_header* shdr, + de265_image* img, const PBMotionCoding& motion, + int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, int partIdx); + + + + +class MotionVectorAccess +{ +public: + virtual enum PartMode get_PartMode(int x,int y) const = 0; + virtual const PBMotion& get_mv_info(int x,int y) const = 0; +}; + + +void get_merge_candidate_list_without_step_9(base_context* ctx, + const slice_segment_header* shdr, + const MotionVectorAccess& mvaccess, + de265_image* img, + int xC,int yC, int xP,int yP, + int nCS, int nPbW,int nPbH, int partIdx, + int max_merge_idx, + PBMotion* mergeCandList); + +#endif diff --git a/libde265/nal-parser.h b/libde265/nal-parser.h new file mode 100644 index 0000000..a63a7fd --- /dev/null +++ b/libde265/nal-parser.h @@ -0,0 +1,154 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef DE265_NAL_PARSER_H +#define DE265_NAL_PARSER_H + +#include "libde265/sps.h" +#include "libde265/pps.h" +#include "libde265/nal.h" +#include "libde265/util.h" + +#include +#include + +#define DE265_NAL_FREE_LIST_SIZE 16 +#define DE265_SKIPPED_BYTES_INITIAL_SIZE 16 + + +class NAL_unit { + public: + NAL_unit(); + ~NAL_unit(); + + nal_header header; + + de265_PTS pts; + void* user_data; + + + void clear(); + + // --- rbsp data --- + + LIBDE265_CHECK_RESULT bool resize(int new_size); + LIBDE265_CHECK_RESULT bool append(const unsigned char* data, int n); + LIBDE265_CHECK_RESULT bool set_data(const unsigned char* data, int n); + + int size() const { return data_size; } + void set_size(int s) { data_size=s; } + unsigned char* data() { return nal_data; } + const unsigned char* data() const { return nal_data; } + + + // --- skipped stuffing bytes --- + + int num_skipped_bytes_before(int byte_position, int headerLength) const; + int num_skipped_bytes() const { return skipped_bytes.size(); } + + //void clear_skipped_bytes() { skipped_bytes.clear(); } + + /* Mark a byte as skipped. It is assumed that the byte is already removed + from the input data. The NAL data is not modified. + */ + void insert_skipped_byte(int pos); + + /* Remove all stuffing bytes from NAL data. The NAL data is modified and + the removed bytes are marked as skipped bytes. 
+ */ + void remove_stuffing_bytes(); + + private: + unsigned char* nal_data; + int data_size; + int capacity; + + std::vector skipped_bytes; // up to position[x], there were 'x' skipped bytes +}; + + +class NAL_Parser +{ + public: + NAL_Parser(); + ~NAL_Parser(); + + de265_error push_data(const unsigned char* data, int len, + de265_PTS pts, void* user_data = NULL); + + de265_error push_NAL(const unsigned char* data, int len, + de265_PTS pts, void* user_data = NULL); + + NAL_unit* pop_from_NAL_queue(); + de265_error flush_data(); + void mark_end_of_stream() { end_of_stream=true; } + void mark_end_of_frame() { end_of_frame=true; } + void remove_pending_input_data(); + + int bytes_in_input_queue() const { + int size = nBytes_in_NAL_queue; + if (pending_input_NAL) { size += pending_input_NAL->size(); } + return size; + } + + int number_of_NAL_units_pending() const { + int size = NAL_queue.size(); + if (pending_input_NAL) { size++; } + return size; + } + + int number_of_complete_NAL_units_pending() const { + return NAL_queue.size(); + } + + void free_NAL_unit(NAL_unit*); + + + int get_NAL_queue_length() const { return NAL_queue.size(); } + bool is_end_of_stream() const { return end_of_stream; } + bool is_end_of_frame() const { return end_of_frame; } + + private: + // byte-stream level + + bool end_of_stream; // data in pending_input_data is end of stream + bool end_of_frame; // data in pending_input_data is end of frame + int input_push_state; + + NAL_unit* pending_input_NAL; + + + // NAL level + + std::queue NAL_queue; // enqueued NALs have suffing bytes removed + int nBytes_in_NAL_queue; // data bytes currently in NAL_queue + + void push_to_NAL_queue(NAL_unit*); + + + // pool of unused NAL memory + + std::vector NAL_free_list; // maximum size: DE265_NAL_FREE_LIST_SIZE + + LIBDE265_CHECK_RESULT NAL_unit* alloc_NAL_unit(int size); +}; + + +#endif diff --git a/libde265/nal.h b/libde265/nal.h new file mode 100644 index 0000000..2bd85db --- /dev/null +++ b/libde265/nal.h 
@@ -0,0 +1,129 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_NAL_H +#define DE265_NAL_H + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#ifdef HAVE_STDBOOL_H +#include +#endif + +#include "libde265/bitstream.h" +#include "libde265/cabac.h" + +struct nal_header { + nal_header() { + nal_unit_type = 0; + nuh_layer_id = 0; + nuh_temporal_id = 0; + } + + void read(bitreader* reader); + void write(CABAC_encoder& writer) const; + + void set(int unit_type, int layer_id=0, int temporal_id=0) { + nal_unit_type =unit_type; + nuh_layer_id =layer_id; + nuh_temporal_id=temporal_id; + } + + uint8_t nal_unit_type; + uint8_t nuh_layer_id; + uint8_t nuh_temporal_id; +}; + +#define NAL_UNIT_TRAIL_N 0 +#define NAL_UNIT_TRAIL_R 1 +#define NAL_UNIT_TSA_N 2 +#define NAL_UNIT_TSA_R 3 +#define NAL_UNIT_STSA_N 4 +#define NAL_UNIT_STSA_R 5 +#define NAL_UNIT_RADL_N 6 +#define NAL_UNIT_RADL_R 7 +#define NAL_UNIT_RASL_N 8 +#define NAL_UNIT_RASL_R 9 +#define NAL_UNIT_RESERVED_VCL_N10 10 +#define NAL_UNIT_RESERVED_VCL_N12 12 +#define NAL_UNIT_RESERVED_VCL_N14 14 +#define NAL_UNIT_RESERVED_VCL_R11 11 +#define NAL_UNIT_RESERVED_VCL_R13 13 +#define NAL_UNIT_RESERVED_VCL_R15 15 +#define NAL_UNIT_BLA_W_LP 16 // BLA = broken link access +#define NAL_UNIT_BLA_W_RADL 17 
+#define NAL_UNIT_BLA_N_LP 18 +#define NAL_UNIT_IDR_W_RADL 19 +#define NAL_UNIT_IDR_N_LP 20 +#define NAL_UNIT_CRA_NUT 21 // CRA = clean random access +#define NAL_UNIT_RESERVED_IRAP_VCL22 22 +#define NAL_UNIT_RESERVED_IRAP_VCL23 23 +#define NAL_UNIT_RESERVED_VCL24 24 +#define NAL_UNIT_RESERVED_VCL25 25 +#define NAL_UNIT_RESERVED_VCL26 26 +#define NAL_UNIT_RESERVED_VCL27 27 +#define NAL_UNIT_RESERVED_VCL28 28 +#define NAL_UNIT_RESERVED_VCL29 29 +#define NAL_UNIT_RESERVED_VCL30 30 +#define NAL_UNIT_RESERVED_VCL31 31 +#define NAL_UNIT_VPS_NUT 32 +#define NAL_UNIT_SPS_NUT 33 +#define NAL_UNIT_PPS_NUT 34 +#define NAL_UNIT_AUD_NUT 35 +#define NAL_UNIT_EOS_NUT 36 +#define NAL_UNIT_EOB_NUT 37 +#define NAL_UNIT_FD_NUT 38 +#define NAL_UNIT_PREFIX_SEI_NUT 39 +#define NAL_UNIT_SUFFIX_SEI_NUT 40 +#define NAL_UNIT_RESERVED_NVCL41 41 +#define NAL_UNIT_RESERVED_NVCL42 42 +#define NAL_UNIT_RESERVED_NVCL43 43 +#define NAL_UNIT_RESERVED_NVCL44 44 +#define NAL_UNIT_RESERVED_NVCL45 45 +#define NAL_UNIT_RESERVED_NVCL46 46 +#define NAL_UNIT_RESERVED_NVCL47 47 + +#define NAL_UNIT_UNDEFINED 255 + +bool isIDR(uint8_t unit_type); +bool isBLA(uint8_t unit_type); +bool isCRA(uint8_t unit_type); +bool isRAP(uint8_t unit_type); +bool isRASL(uint8_t unit_type); +bool isIRAP(uint8_t unit_type); +bool isRADL(uint8_t unit_type); +bool isReferenceNALU(uint8_t unit_type); +bool isSublayerNonReference(uint8_t unit_type); + +const char* get_NAL_name(uint8_t unit_type); + +inline bool isIdrPic(uint8_t nal_unit_type) { + return (nal_unit_type == NAL_UNIT_IDR_W_RADL || + nal_unit_type == NAL_UNIT_IDR_N_LP); +} + +inline bool isRapPic(uint8_t nal_unit_type) { + return nal_unit_type >= 16 && nal_unit_type <= 23; +} + +#endif diff --git a/libde265/pps.h b/libde265/pps.h new file mode 100644 index 0000000..81ff1f6 --- /dev/null +++ b/libde265/pps.h @@ -0,0 +1,163 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. 
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DE265_PPS_H
+#define DE265_PPS_H
+
+#include "libde265/bitstream.h"
+#include "libde265/sps.h" // for scaling list only
+
+#include <vector>
+#include <memory>
+
+#define DE265_MAX_TILE_COLUMNS 10
+#define DE265_MAX_TILE_ROWS    10
+
+class decoder_context;
+class pic_parameter_set;
+
+
+class pps_range_extension
+{
+ public:
+  pps_range_extension() { reset(); }
+
+  void reset();
+
+  bool read(bitreader*, decoder_context*, const pic_parameter_set*);
+  void dump(int fd) const;
+
+  uint8_t log2_max_transform_skip_block_size;
+  bool    cross_component_prediction_enabled_flag;
+  bool    chroma_qp_offset_list_enabled_flag;
+  uint8_t diff_cu_chroma_qp_offset_depth;
+  uint8_t chroma_qp_offset_list_len;
+  int8_t  cb_qp_offset_list[6];
+  int8_t  cr_qp_offset_list[6];
+  uint8_t log2_sao_offset_scale_luma;
+  uint8_t log2_sao_offset_scale_chroma;
+};
+
+
+class pic_parameter_set {
+public:
+  pic_parameter_set();
+  ~pic_parameter_set();
+
+  void reset() { set_defaults(); }
+  bool read(bitreader*, decoder_context*);
+  bool write(error_queue*, CABAC_encoder&,
+             const seq_parameter_set* sps);
+
+  bool is_tile_start_CTB(int ctbX,int ctbY) const;
+  void dump(int fd) const;
+
+
+  void set_defaults(enum PresetSet = Preset_Default);
+
+  bool pps_read; // whether this pps has been read from bitstream
+  std::shared_ptr<const seq_parameter_set> sps;
+
+
+  char pic_parameter_set_id;
+  char seq_parameter_set_id;
+  char dependent_slice_segments_enabled_flag;
+  char sign_data_hiding_flag;
+  char cabac_init_present_flag;
+  char num_ref_idx_l0_default_active; // [1;16]
+  char num_ref_idx_l1_default_active; // [1;16]
+
+  int  pic_init_qp;
+  char constrained_intra_pred_flag;
+  char transform_skip_enabled_flag;
+
+  // --- QP ---
+
+  char cu_qp_delta_enabled_flag;
+  int  diff_cu_qp_delta_depth;   // [ 0 ; log2_diff_max_min_luma_coding_block_size ]
+
+  int  pic_cb_qp_offset;
+  int  pic_cr_qp_offset;
+  char pps_slice_chroma_qp_offsets_present_flag;
+
+
+  char weighted_pred_flag;
+  char weighted_bipred_flag;
+  char output_flag_present_flag;
+  char transquant_bypass_enable_flag;
+  char entropy_coding_sync_enabled_flag;
+
+
+  // --- tiles ---
+
+  char tiles_enabled_flag;
+  int  num_tile_columns;    // [1;PicWidthInCtbsY]
+  int  num_tile_rows;       // [1;PicHeightInCtbsY]
+  char uniform_spacing_flag;
+
+
+  // --- ---
+
+  char loop_filter_across_tiles_enabled_flag;
+  char pps_loop_filter_across_slices_enabled_flag;
+  char deblocking_filter_control_present_flag;
+
+  char deblocking_filter_override_enabled_flag;
+  char pic_disable_deblocking_filter_flag;
+
+  int beta_offset;
+  int tc_offset;
+
+  char pic_scaling_list_data_present_flag;
+  struct scaling_list_data scaling_list; // contains valid data if sps->scaling_list_enabled_flag set
+
+  char lists_modification_present_flag;
+  int log2_parallel_merge_level; // [2 ; log2(max CB size)]
+  char num_extra_slice_header_bits;
+  char slice_segment_header_extension_present_flag;
+  char pps_extension_flag;
+  char pps_range_extension_flag;
+  char pps_multilayer_extension_flag;
+  char pps_extension_6bits;
+
+  pps_range_extension range_extension;
+
+
+  // --- derived values ---
+
+  int Log2MinCuQpDeltaSize;
+  int Log2MinCuChromaQpOffsetSize;
+  int Log2MaxTransformSkipSize;
+
+  int colWidth [ DE265_MAX_TILE_COLUMNS ];
+  int rowHeight[ DE265_MAX_TILE_ROWS ];
+  int colBd    [ DE265_MAX_TILE_COLUMNS+1 ];
+  int rowBd    [ DE265_MAX_TILE_ROWS+1 ];
+
+  std::vector<int> CtbAddrRStoTS; // #CTBs
+  std::vector<int> CtbAddrTStoRS; // #CTBs
+  std::vector<int> TileId;        // #CTBs  // index in tile-scan order
+  std::vector<int> TileIdRS;      // #CTBs  // index in raster-scan order
+  std::vector<int> MinTbAddrZS;   // #TBs   [x + y*PicWidthInTbsY]
+
+  void set_derived_values(const seq_parameter_set* sps);
+};
+
+#endif
diff --git a/libde265/quality.h b/libde265/quality.h
new file mode 100644
index 0000000..7073d14
--- /dev/null
+++ b/libde265/quality.h
@@ -0,0 +1,47 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DE265_QUALITY_H
+#define DE265_QUALITY_H
+
+#include <stdint.h>
+#include <libde265/de265.h>
+#include <libde265/image.h>
+
+
+LIBDE265_API uint32_t SSD(const uint8_t* img, int imgStride,
+                          const uint8_t* ref, int refStride,
+                          int width, int height);
+
+LIBDE265_API uint32_t SAD(const uint8_t* img, int imgStride,
+                          const uint8_t* ref, int refStride,
+                          int width, int height);
+
+LIBDE265_API double MSE(const uint8_t* img, int imgStride,
+                        const uint8_t* ref, int refStride,
+                        int width, int height);
+
+LIBDE265_API double PSNR(double mse);
+
+
+LIBDE265_API uint32_t compute_distortion_ssd(const de265_image* img1, const de265_image* img2,
+                                             int x0, int y0, int log2size, int cIdx);
+
+#endif
diff --git a/libde265/refpic.h b/libde265/refpic.h
new file mode 100644
index 0000000..2904197
--- /dev/null
+++ b/libde265/refpic.h
@@ -0,0 +1,61 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DE265_REFPIC_H
+#define DE265_REFPIC_H
+
+#include "libde265/bitstream.h"
+
+#define MAX_NUM_REF_PICS 16  // maximum defined by standard, may be lower for some Levels
+
+
+class ref_pic_set
+{
+ public:
+  // Lists of pictures that have to be kept in the decoded picture buffer for future
+  // reference and that may optionally be used for prediction in the current frame.
+  // Lists contain the relative POC positions.
+  int16_t DeltaPocS0[MAX_NUM_REF_PICS]; // sorted in decreasing order (e.g. -1, -2, -4, -7, ...)
+  int16_t DeltaPocS1[MAX_NUM_REF_PICS]; // sorted in ascending order (e.g. 1, 2, 4, 7)
+
+  // flag for each reference whether this is actually used for prediction in the current frame
+  uint8_t UsedByCurrPicS0[MAX_NUM_REF_PICS];
+  uint8_t UsedByCurrPicS1[MAX_NUM_REF_PICS];
+
+  uint8_t NumNegativePics;  // number of past reference pictures
+  uint8_t NumPositivePics;  // number of future reference pictures
+
+  // --- derived values ---
+
+  void compute_derived_values();
+
+  uint8_t NumDeltaPocs;     // total number of reference pictures (past + future)
+
+  uint8_t NumPocTotalCurr_shortterm_only; /* Total number of reference pictures that may actually
+                                             be used for prediction in the current frame. */
+
+  void reset();
+};
+
+
+void dump_short_term_ref_pic_set(const ref_pic_set*, FILE* fh);
+void dump_compact_short_term_ref_pic_set(const ref_pic_set* set, int range, FILE* fh);
+
+#endif
diff --git a/libde265/sao.h b/libde265/sao.h
new file mode 100644
index 0000000..bb9e08c
--- /dev/null
+++ b/libde265/sao.h
@@ -0,0 +1,36 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DE265_SAO_H
+#define DE265_SAO_H
+
+#include "libde265/decctx.h"
+
+void apply_sample_adaptive_offset(de265_image* img);
+
+/* requires less memory than the function above */
+void apply_sample_adaptive_offset_sequential(de265_image* img);
+
+/* saoInputProgress - the CTB progress that SAO will wait for before beginning processing.
+   Returns 'true' if any tasks have been added.
+ */
+bool add_sao_tasks(image_unit* imgunit, int saoInputProgress);
+
+#endif
diff --git a/libde265/scan.h b/libde265/scan.h
new file mode 100644
index 0000000..7a8b977
--- /dev/null
+++ b/libde265/scan.h
@@ -0,0 +1,43 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DE265_SCAN_H
+#define DE265_SCAN_H
+
+#include <stdint.h>
+
+typedef struct {
+  uint8_t x,y;
+} position;
+
+typedef struct {
+  uint8_t subBlock;
+  uint8_t scanPos;
+} scan_position;
+
+void init_scan_orders();
+
+/* scanIdx: 0 - diag, 1 - horiz, 2 - verti
+ */
+const position* get_scan_order(int log2BlockSize, int scanIdx);
+
+scan_position get_scan_position(int x,int y, int scanIdx, int log2BlkSize);
+
+#endif
diff --git a/libde265/sei.h b/libde265/sei.h
new file mode 100644
index 0000000..fd615d5
--- /dev/null
+++ b/libde265/sei.h
@@ -0,0 +1,89 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DE265_SEI_H
+#define DE265_SEI_H
+
+#include "libde265/bitstream.h"
+#include "libde265/de265.h"
+
+
+enum sei_payload_type {
+  sei_payload_type_buffering_period = 0,
+  sei_payload_type_pic_timing = 1,
+  sei_payload_type_pan_scan_rect = 2,
+  sei_payload_type_filler_payload = 3,
+  sei_payload_type_user_data_registered_itu_t_t35 = 4,
+  sei_payload_type_user_data_unregistered = 5,
+  sei_payload_type_recovery_point = 6,
+  sei_payload_type_scene_info = 9,
+  sei_payload_type_picture_snapshot = 15,
+  sei_payload_type_progressive_refinement_segment_start = 16,
+  sei_payload_type_progressive_refinement_segment_end = 17,
+  sei_payload_type_film_grain_characteristics = 19,
+  sei_payload_type_post_filter_hint = 22,
+  sei_payload_type_tone_mapping_info = 23,
+  sei_payload_type_frame_packing_arrangement = 45,
+  sei_payload_type_display_orientation = 47,
+  sei_payload_type_structure_of_pictures_info = 128,
+  sei_payload_type_active_parameter_sets = 129,
+  sei_payload_type_decoding_unit_info = 130,
+  sei_payload_type_temporal_sub_layer_zero_index = 131,
+  sei_payload_type_decoded_picture_hash = 132,
+  sei_payload_type_scalable_nesting = 133,
+  sei_payload_type_region_refresh_info = 134,
+  sei_payload_type_no_display = 135,
+  sei_payload_type_motion_constrained_tile_sets = 136
+};
+
+
+enum sei_decoded_picture_hash_type {
+  sei_decoded_picture_hash_type_MD5 = 0,
+  sei_decoded_picture_hash_type_CRC = 1,
+  sei_decoded_picture_hash_type_checksum = 2
+};
+
+
+typedef struct {
+  enum sei_decoded_picture_hash_type hash_type; // selects which of the arrays below is meaningful (presumably; confirm in sei.cc)
+  uint8_t md5[3][16];   // three 16-byte digests (presumably one per colour component)
+  uint16_t crc[3];      // three 16-bit values
+  uint32_t checksum[3]; // three 32-bit values
+} sei_decoded_picture_hash;
+
+
+typedef struct {
+  enum sei_payload_type payload_type;
+  int payload_size;
+
+  union { // payload-specific data; decoded_picture_hash is the only member declared here
+    sei_decoded_picture_hash decoded_picture_hash;
+  } data;
+} sei_message;
+
+class seq_parameter_set;
+
+const char* sei_type_name(enum sei_payload_type type);
+
+de265_error read_sei(bitreader* reader, sei_message*, bool suffix, const seq_parameter_set* sps);
+void dump_sei(const sei_message*, const seq_parameter_set* sps);
+de265_error process_sei(const sei_message*, struct de265_image* img);
+
+#endif
diff --git a/libde265/slice.h b/libde265/slice.h
new file mode 100644
index 0000000..0f476f2
--- /dev/null
+++ b/libde265/slice.h
@@ -0,0 +1,313 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin
+ *
+ * Authors: struktur AG, Dirk Farin
+ *          Min Chen
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DE265_SLICE_H
+#define DE265_SLICE_H
+
+#include "libde265/cabac.h"
+#include "libde265/de265.h"
+#include "libde265/util.h"
+#include "libde265/refpic.h"
+#include "libde265/threads.h"
+#include "libde265/contextmodel.h"
+
+#include <vector>
+#include <string.h>
+#include <memory>
+
+#define MAX_NUM_REF_PICS 16
+
+class decoder_context;
+class thread_context;
+class error_queue;
+class seq_parameter_set;
+class pic_parameter_set;
+
+enum SliceType
+  {
+    SLICE_TYPE_B = 0,
+    SLICE_TYPE_P = 1,
+    SLICE_TYPE_I = 2
+  };
+
+/*
+        2Nx2N           2NxN             Nx2N            NxN
+      +-------+       +-------+       +---+---+       +---+---+
+      |       |       |       |       |   |   |       |   |   |
+      |       |       |_______|       |   |   |       |___|___|
+      |       |       |       |       |   |   |       |   |   |
+      |       |       |       |       |   |   |       |   |   |
+      +-------+       +-------+       +---+---+       +---+---+
+
+        2NxnU           2NxnD           nLx2N           nRx2N
+      +-------+       +-------+       +-+-----+       +-----+-+
+      |_______|       |       |       | |     |       |     | |
+      |       |       |       |       | |     |       |     | |
+      |       |       |_______|       | |     |       |     | |
+      |       |       |       |       | |     |       |     | |
+      +-------+       +-------+       +-+-----+       +-----+-+
+
+      - AMP only if CU size > min CU size -> minimum PU size = CUsize/2
+      - NxN only if size >= 16x16 (-> minimum block size = 8x8)
+      - minimum block size for Bi-Pred is 8x8 (wikipedia: Coding_tree_unit)
+*/
+enum PartMode
+  {
+    PART_2Nx2N = 0,
+    PART_2NxN  = 1,
+    PART_Nx2N  = 2,
+    PART_NxN   = 3,
+    PART_2NxnU = 4,
+    PART_2NxnD = 5,
+    PART_nLx2N = 6,
+    PART_nRx2N = 7
+  };
+
+const char* part_mode_name(enum PartMode);
+
+
+enum PredMode
+  {
+    MODE_INTRA, MODE_INTER, MODE_SKIP
+  };
+
+enum IntraPredMode
+  {
+    INTRA_PLANAR = 0,
+    INTRA_DC = 1,
+    INTRA_ANGULAR_2 = 2,    INTRA_ANGULAR_3 = 3,    INTRA_ANGULAR_4 = 4,    INTRA_ANGULAR_5 = 5,
+    INTRA_ANGULAR_6 = 6,    INTRA_ANGULAR_7 = 7,    INTRA_ANGULAR_8 = 8,    INTRA_ANGULAR_9 = 9,
+    INTRA_ANGULAR_10 = 10,  INTRA_ANGULAR_11 = 11,  INTRA_ANGULAR_12 = 12,  INTRA_ANGULAR_13 = 13,
+    INTRA_ANGULAR_14 = 14,  INTRA_ANGULAR_15 = 15,  INTRA_ANGULAR_16 = 16,  INTRA_ANGULAR_17 = 17,
+    INTRA_ANGULAR_18 = 18,  INTRA_ANGULAR_19 = 19,  INTRA_ANGULAR_20 = 20,  INTRA_ANGULAR_21 = 21,
+    INTRA_ANGULAR_22 = 22,  INTRA_ANGULAR_23 = 23,  INTRA_ANGULAR_24 = 24,  INTRA_ANGULAR_25 = 25,
+    INTRA_ANGULAR_26 = 26,  INTRA_ANGULAR_27 = 27,  INTRA_ANGULAR_28 = 28,  INTRA_ANGULAR_29 = 29,
+    INTRA_ANGULAR_30 = 30,  INTRA_ANGULAR_31 = 31,  INTRA_ANGULAR_32 = 32,  INTRA_ANGULAR_33 = 33,
+    INTRA_ANGULAR_34 = 34
+  };
+
+
+enum IntraChromaPredMode
+  {
+    INTRA_CHROMA_PLANAR_OR_34     = 0,
+    INTRA_CHROMA_ANGULAR_26_OR_34 = 1,
+    INTRA_CHROMA_ANGULAR_10_OR_34 = 2,
+    INTRA_CHROMA_DC_OR_34         = 3,
+    INTRA_CHROMA_LIKE_LUMA        = 4
+  };
+
+
+enum InterPredIdc
+  {
+    // note: values have to match the decoding function decode_inter_pred_idc()
+    PRED_L0=1,
+    PRED_L1=2,
+    PRED_BI=3
+  };
+
+
+
+class slice_segment_header {
+public:
+  slice_segment_header() {
+    reset();
+  }
+
+  de265_error read(bitreader* br, decoder_context*, bool* continueDecoding);
+  de265_error write(error_queue*, CABAC_encoder&,
+                    const seq_parameter_set* sps,
+                    const pic_parameter_set* pps,
+                    uint8_t nal_unit_type);
+
+  void dump_slice_segment_header(const decoder_context*, int fd) const;
+
+  void set_defaults();
+  void reset();
+
+
+  int  slice_index; // index through all slices in a picture  (internal only)
+  std::shared_ptr<const pic_parameter_set> pps;
+
+
+  char first_slice_segment_in_pic_flag;
+  char no_output_of_prior_pics_flag;
+  int  slice_pic_parameter_set_id;
+  char dependent_slice_segment_flag;
+  int  slice_segment_address;
+
+  int  slice_type;
+  char pic_output_flag;
+  char colour_plane_id;
+  int  slice_pic_order_cnt_lsb;
+  char short_term_ref_pic_set_sps_flag;
+  ref_pic_set slice_ref_pic_set;
+
+  int  short_term_ref_pic_set_idx;
+  int  num_long_term_sps;
+  int  num_long_term_pics;
+
+  uint8_t lt_idx_sps[MAX_NUM_REF_PICS];
+  int     poc_lsb_lt[MAX_NUM_REF_PICS];
+  char    used_by_curr_pic_lt_flag[MAX_NUM_REF_PICS];
+
+  char delta_poc_msb_present_flag[MAX_NUM_REF_PICS];
+  int delta_poc_msb_cycle_lt[MAX_NUM_REF_PICS];
+
+  char slice_temporal_mvp_enabled_flag;
+  char slice_sao_luma_flag;
+  char slice_sao_chroma_flag;
+
+  char num_ref_idx_active_override_flag;
+  int  num_ref_idx_l0_active; // [1;16]
+  int  num_ref_idx_l1_active; // [1;16]
+
+  char ref_pic_list_modification_flag_l0;
+  char ref_pic_list_modification_flag_l1;
+  uint8_t list_entry_l0[16];
+  uint8_t list_entry_l1[16];
+
+  char mvd_l1_zero_flag;
+  char cabac_init_flag;
+  char collocated_from_l0_flag;
+  int  collocated_ref_idx;
+
+  // --- pred_weight_table ---
+
+  uint8_t luma_log2_weight_denom; // [0;7]
+  uint8_t ChromaLog2WeightDenom;  // [0;7]
+
+  // first index is L0/L1
+  uint8_t luma_weight_flag[2][16];   // bool
+  uint8_t chroma_weight_flag[2][16]; // bool
+  int16_t LumaWeight[2][16];
+  int8_t  luma_offset[2][16];
+  int16_t ChromaWeight[2][16][2];
+  int8_t  ChromaOffset[2][16][2];
+
+
+  int  five_minus_max_num_merge_cand;
+  int  slice_qp_delta;
+
+  int  slice_cb_qp_offset;
+  int  slice_cr_qp_offset;
+
+  char cu_chroma_qp_offset_enabled_flag;
+
+  char deblocking_filter_override_flag;
+  char slice_deblocking_filter_disabled_flag;
+  int  slice_beta_offset; // = pps->beta_offset if undefined
+  int  slice_tc_offset;   // = pps->tc_offset if undefined
+
+  char slice_loop_filter_across_slices_enabled_flag;
+
+  int  num_entry_point_offsets;
+  int  offset_len;
+  std::vector<int> entry_point_offset;
+
+  int  slice_segment_header_extension_length;
+
+
+  // --- derived data ---
+
+  int SliceQPY;
+  int initType;
+
+  void compute_derived_values(const pic_parameter_set* pps);
+
+
+  // --- data for external modules ---
+
+  int SliceAddrRS;  // slice_segment_address of last independent slice
+
+  int MaxNumMergeCand;  // directly derived from 'five_minus_max_num_merge_cand'
+  int CurrRpsIdx;
+  ref_pic_set CurrRps;  // the active reference-picture set
+  int NumPocTotalCurr;
+
+  // number of entries: num_ref_idx_l0_active / num_ref_idx_l1_active
+  int RefPicList[2][MAX_NUM_REF_PICS]; // contains buffer IDs (D:indices into DPB/E:frame number)
+  int RefPicList_POC[2][MAX_NUM_REF_PICS];
+  int RefPicList_PicState[2][MAX_NUM_REF_PICS]; /* We have to save the PicState because the decoding
+                                                   of an image may be delayed and the PicState can
+                                                   change in the mean-time (e.g. from ShortTerm to
+                                                   LongTerm). PicState is used in motion.cc */
+
+  char LongTermRefPic[2][MAX_NUM_REF_PICS]; /* Flag whether the picture at this ref-pic-list
+                                               is a long-term picture. */
+
+  // context storage for dependent slices (stores CABAC model at end of slice segment)
+  context_model_table ctx_model_storage;
+  bool ctx_model_storage_defined; // whether there is valid data in ctx_model_storage
+
+  std::vector<int> RemoveReferencesList; // images that can be removed from the DPB before decoding this slice
+
+};
+
+
+
+typedef struct {
+  // TODO: we could combine SaoTypeIdx and SaoEoClass into one byte to make the struct 16 bytes only
+
+  unsigned char SaoTypeIdx; // use with (SaoTypeIdx>>(2*cIdx)) & 0x3
+  unsigned char SaoEoClass; // use with (SaoEoClass>>(2*cIdx)) & 0x3
+
+  uint8_t sao_band_position[3];
+  int8_t  saoOffsetVal[3][4]; // index with [][idx-1] as saoOffsetVal[][0]==0 always
+} sao_info;
+
+
+
+
+de265_error read_slice_segment_data(thread_context* tctx);
+
+bool alloc_and_init_significant_coeff_ctxIdx_lookupTable();
+void free_significant_coeff_ctxIdx_lookupTable();
+
+
+class thread_task_ctb_row : public thread_task
+{
+public:
+  bool   firstSliceSubstream;
+  int    debug_startCtbRow;
+  thread_context* tctx;
+
+  virtual void work();
+  virtual std::string name() const;
+};
+
+class thread_task_slice_segment : public thread_task
+{
+public:
+  bool   firstSliceSubstream;
+  int    debug_startCtbX, debug_startCtbY;
+  thread_context* tctx;
+
+  virtual void work();
+  virtual std::string name() const;
+};
+
+
+int check_CTB_available(const de265_image* img,
+                        int xC,int yC, int xN,int yN);
+
+#endif
diff --git a/libde265/sps.h b/libde265/sps.h
new file mode 100644
index 0000000..b06151d
--- /dev/null
+++ b/libde265/sps.h
@@ -0,0 +1,257 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DE265_SPS_H
+#define DE265_SPS_H
+
+#include "libde265/vps.h"
+#include "libde265/vui.h"
+#include "libde265/bitstream.h"
+#include "libde265/refpic.h"
+#include "libde265/de265.h"
+#include "libde265/cabac.h"
+
+#include <vector>
+
+class error_queue;
+
+// #define MAX_REF_PIC_SETS 64  // maximum according to standard
+#define MAX_NUM_LT_REF_PICS_SPS 32
+
+// this is just a safety range
+#define MAX_PICTURE_WIDTH  70000
+#define MAX_PICTURE_HEIGHT 70000
+
+enum {
+  CHROMA_MONO = 0,
+  CHROMA_420 = 1,
+  CHROMA_422 = 2,
+  CHROMA_444 = 3,
+  CHROMA_444_SEPARATE
+};
+
+
+typedef struct scaling_list_data {
+  // structure size: approx. 4 kB
+
+  uint8_t ScalingFactor_Size0[6][4][4];
+  uint8_t ScalingFactor_Size1[6][8][8];
+  uint8_t ScalingFactor_Size2[6][16][16];
+  uint8_t ScalingFactor_Size3[2][32][32];
+} scaling_list_data;
+
+
+enum PresetSet {
+  Preset_Default
+};
+
+
+class sps_range_extension
+{
+ public:
+  sps_range_extension();
+
+  uint8_t transform_skip_rotation_enabled_flag;
+  uint8_t transform_skip_context_enabled_flag;
+  uint8_t implicit_rdpcm_enabled_flag;
+  uint8_t explicit_rdpcm_enabled_flag;
+  uint8_t extended_precision_processing_flag;
+  uint8_t intra_smoothing_disabled_flag;
+  uint8_t high_precision_offsets_enabled_flag;
+  uint8_t persistent_rice_adaptation_enabled_flag;
+  uint8_t cabac_bypass_alignment_enabled_flag;
+
+  de265_error read(error_queue*, bitreader*);
+  void dump(int fd) const;
+};
+
+
+class seq_parameter_set {
+public:
+  seq_parameter_set();
+  ~seq_parameter_set();
+
+  de265_error read(error_queue*, bitreader*);
+  de265_error write(error_queue*, CABAC_encoder&);
+
+  void dump(int fd) const;
+
+  void set_defaults(enum PresetSet = Preset_Default);
+  void set_CB_log2size_range(int mini,int maxi);
+  void set_TB_log2size_range(int mini,int maxi);
+  void set_resolution(int w,int h);
+
+  bool sps_read; // whether the sps has been read from the bitstream
+
+
+  char video_parameter_set_id;
+  char sps_max_sub_layers;            // [1;7]
+  char sps_temporal_id_nesting_flag;
+
+  profile_tier_level profile_tier_level_;
+
+  int seq_parameter_set_id;
+  int chroma_format_idc;
+
+  char separate_colour_plane_flag;
+  int  pic_width_in_luma_samples;
+  int  pic_height_in_luma_samples;
+  char conformance_window_flag;
+
+  int conf_win_left_offset;
+  int conf_win_right_offset;
+  int conf_win_top_offset;
+  int conf_win_bottom_offset;
+
+  int bit_depth_luma;
+  int bit_depth_chroma;
+
+  int  log2_max_pic_order_cnt_lsb;
+  char sps_sub_layer_ordering_info_present_flag;
+
+  int sps_max_dec_pic_buffering[7]; // for each temporal layer
+  int sps_max_num_reorder_pics[7];
+  int sps_max_latency_increase_plus1[7];
+
+  int  log2_min_luma_coding_block_size;             // smallest CB size [3;6]
+  int  log2_diff_max_min_luma_coding_block_size;    // largest  CB size
+  int  log2_min_transform_block_size;               // smallest TB size [2;5]
+  int  log2_diff_max_min_transform_block_size;      // largest  TB size
+  int  max_transform_hierarchy_depth_inter;
+  int  max_transform_hierarchy_depth_intra;
+
+  char scaling_list_enable_flag;
+  char sps_scaling_list_data_present_flag; /* if not set, the default scaling lists will be set
+                                              in scaling_list */
+
+  struct scaling_list_data scaling_list;
+
+  char amp_enabled_flag;
+  char sample_adaptive_offset_enabled_flag;
+  char pcm_enabled_flag;
+
+  char pcm_sample_bit_depth_luma;
+  char pcm_sample_bit_depth_chroma;
+  int  log2_min_pcm_luma_coding_block_size;
+  int  log2_diff_max_min_pcm_luma_coding_block_size;
+  char pcm_loop_filter_disable_flag;
+
+  int num_short_term_ref_pic_sets() const { return static_cast<int>(ref_pic_sets.size()); }
+  std::vector<ref_pic_set> ref_pic_sets; // [0 ; num_short_term_ref_pic_set (<=MAX_REF_PIC_SETS) )
+
+  char long_term_ref_pics_present_flag;
+
+  int num_long_term_ref_pics_sps;
+
+  int  lt_ref_pic_poc_lsb_sps[MAX_NUM_LT_REF_PICS_SPS];
+  char used_by_curr_pic_lt_sps_flag[MAX_NUM_LT_REF_PICS_SPS];
+
+  char sps_temporal_mvp_enabled_flag;
+  char strong_intra_smoothing_enable_flag;
+
+  char vui_parameters_present_flag;
+  video_usability_information vui;
+
+  char sps_extension_present_flag;
+  char sps_range_extension_flag;
+  char sps_multilayer_extension_flag;
+  char sps_extension_6bits;
+
+  sps_range_extension range_extension;
+
+  /*
+    if( sps_extension_flag )
+    while( more_rbsp_data() )
+    sps_extension_data_flag
+    u(1)
+    rbsp_trailing_bits()
+  */
+
+
+  // --- derived values ---
+
+  de265_error compute_derived_values(bool sanitize_values = false);
+
+  int BitDepth_Y;
+  int QpBdOffset_Y;
+  int BitDepth_C;
+  int QpBdOffset_C;
+
+  int ChromaArrayType;
+  int SubWidthC, SubHeightC;
+  int WinUnitX, WinUnitY;
+
+  int MaxPicOrderCntLsb;
+
+  int Log2MinCbSizeY;
+  int Log2CtbSizeY;
+  int MinCbSizeY;
+  int CtbSizeY;
+  int PicWidthInMinCbsY;
+  int PicWidthInCtbsY;
+  int PicHeightInMinCbsY;
+  int PicHeightInCtbsY;
+  int PicSizeInMinCbsY;
+  int PicSizeInCtbsY;
+  int PicSizeInSamplesY;
+
+  int CtbWidthC, CtbHeightC;
+
+  int PicWidthInTbsY; // not in standard
+  int PicHeightInTbsY; // not in standard
+  int PicSizeInTbsY; // not in standard
+
+  int Log2MinTrafoSize;
+  int Log2MaxTrafoSize;
+
+  int Log2MinPUSize;
+  int PicWidthInMinPUs;  // might be rounded up
+  int PicHeightInMinPUs; // might be rounded up
+
+  int Log2MinIpcmCbSizeY;
+  int Log2MaxIpcmCbSizeY;
+
+  int SpsMaxLatencyPictures[7]; // [temporal layer]
+
+  uint8_t WpOffsetBdShiftY;
+  uint8_t WpOffsetBdShiftC;
+  int32_t WpOffsetHalfRangeY;
+  int32_t WpOffsetHalfRangeC;
+
+
+  int getPUIndexRS(int pixelX,int pixelY) const {
+    return (pixelX>>Log2MinPUSize) + (pixelY>>Log2MinPUSize)*PicWidthInMinPUs;
+  }
+
+  int get_bit_depth(int cIdx) const {
+    if (cIdx==0) return BitDepth_Y;
+    else         return BitDepth_C;
+  }
+
+  int get_chroma_shift_W(int cIdx) const { return cIdx ? SubWidthC -1 : 0; }
+  int get_chroma_shift_H(int cIdx) const { return cIdx ? SubHeightC-1 : 0; }
+};
+
+de265_error read_scaling_list(bitreader*, const seq_parameter_set*, scaling_list_data*, bool inPPS);
+de265_error write_scaling_list(CABAC_encoder& out, const seq_parameter_set* sps,
+                               scaling_list_data* sclist, bool inPPS);
+void set_default_scaling_lists(scaling_list_data*);
+
+#endif
diff --git a/libde265/threads.h b/libde265/threads.h
new file mode 100644
index 0000000..2c743bc
--- /dev/null
+++ b/libde265/threads.h
@@ -0,0 +1,148 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DE265_THREADS_H
+#define DE265_THREADS_H
+
+#include "libde265/de265.h"
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef HAVE_STDBOOL_H
+#include <stdbool.h>
+#endif
+
+#include <deque>
+#include <string>
+#include <atomic>
+
+#ifndef _WIN32
+#include <pthread.h>
+
+typedef pthread_t        de265_thread;
+typedef pthread_mutex_t  de265_mutex;
+typedef pthread_cond_t   de265_cond;
+
+#else // _WIN32
+#if !defined(NOMINMAX)
+#define NOMINMAX 1
+#endif
+#include <windows.h>
+#include "../extra/win32cond.h"
+#if _MSC_VER > 1310
+#include <intrin.h>
+#endif
+
+typedef HANDLE              de265_thread;
+typedef HANDLE              de265_mutex;
+typedef win32_cond_t        de265_cond;
+#endif  // _WIN32
+
+#ifndef _WIN32
+int  de265_thread_create(de265_thread* t, void *(*start_routine) (void *), void *arg);
+#else
+int  de265_thread_create(de265_thread* t, LPTHREAD_START_ROUTINE start_routine, void *arg);
+#endif
+void de265_thread_join(de265_thread t);
+void de265_thread_destroy(de265_thread* t);
+void de265_mutex_init(de265_mutex* m);
+void de265_mutex_destroy(de265_mutex* m);
+void de265_mutex_lock(de265_mutex* m);
+void de265_mutex_unlock(de265_mutex* m);
+void de265_cond_init(de265_cond* c);
+void de265_cond_destroy(de265_cond* c);
+void de265_cond_broadcast(de265_cond* c, de265_mutex* m);
+void de265_cond_wait(de265_cond* c,de265_mutex* m);
+void de265_cond_signal(de265_cond* c);
+
+
+class de265_progress_lock
+{
+public:
+  de265_progress_lock();
+  ~de265_progress_lock();
+
+  void wait_for_progress(int progress);
+  void set_progress(int progress);
+  void increase_progress(int progress);
+  int  get_progress() const;
+  void reset(int value=0) { mProgress=value; } // plain assignment: does not touch 'mutex' or 'cond'
+
+private:
+  int mProgress;
+
+  // private data
+
+  de265_mutex mutex;
+  de265_cond  cond;
+};
+
+
+
+class thread_task
+{
+public:
+  thread_task() : state(Queued) { }
+  virtual ~thread_task() { }
+
+  enum { Queued, Running, Blocked, Finished } state;
+
+  virtual void work() = 0;
+
+  virtual std::string name() const { return "noname"; }
+};
+
+
+#define MAX_THREADS 32
+
+/* TODO NOTE: When unblocking a task, we have to check first
+   if there are threads waiting because of the run-count limit.
+   If there are higher-priority tasks, those should be run instead
+   of the just unblocked task.
+ */
+
+class thread_pool
+{
+ public:
+  bool stopped;
+
+  std::deque<thread_task*> tasks;  // we are not the owner
+
+  de265_thread thread[MAX_THREADS];
+  int num_threads;
+
+  int num_threads_working;
+
+  int ctbx[MAX_THREADS]; // the CTB the thread is working on
+  int ctby[MAX_THREADS];
+
+  de265_mutex  mutex;
+  de265_cond   cond_var;
+};
+
+
+de265_error start_thread_pool(thread_pool* pool, int num_threads);
+void        stop_thread_pool(thread_pool* pool); // do not process remaining tasks
+
+void        add_task(thread_pool* pool, thread_task* task); // TODO: can make thread_task const
+
+#endif
diff --git a/libde265/transform.h b/libde265/transform.h
new file mode 100644
index 0000000..6f19049
--- /dev/null
+++ b/libde265/transform.h
@@ -0,0 +1,65 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DE265_TRANSFORM_H
+#define DE265_TRANSFORM_H
+
+#include "libde265/de265.h"
+#include "libde265/decctx.h"
+
+extern const int tab8_22[];
+
+LIBDE265_INLINE static int table8_22(int qPi)
+{
+  if (qPi<30) return qPi;      // identity below 30
+  if (qPi>=43) return qPi-6;   // constant offset from 43 upwards
+  return tab8_22[qPi-30];      // table lookup for 30..42
+}
+
+// (8.6.1)
+void decode_quantization_parameters(thread_context* tctx, int xC,int yC,
+                                    int xCUBase, int yCUBase);
+
+// (8.6.2)
+void scale_coefficients(thread_context* tctx,
+                        int xT,int yT, // position of TU in frame (chroma adapted)
+                        int x0,int y0, // position of CU in frame (chroma adapted)
+                        int nT, int cIdx,
+                        bool transform_skip_flag, bool intra, int rdpcmMode);
+
+
+void inv_transform(acceleration_functions* acceleration,
+                   uint8_t* dst, int dstStride, int16_t* coeff,
+                   int log2TbSize, int trType);
+
+void fwd_transform(acceleration_functions* acceleration,
+                   int16_t* coeff, int coeffStride, int log2TbSize, int trType,
+                   const int16_t* src, int srcStride);
+
+void quant_coefficients(int16_t* out_coeff,
+                        const int16_t* in_coeff,
+                        int log2TrSize, int qp,
+                        bool intra);
+
+void dequant_coefficients(int16_t* out_coeff,
+                          const int16_t* in_coeff,
+                          int log2TrSize, int qP);
+
+#endif
diff --git a/libde265/util.h b/libde265/util.h
new file mode 100644
index 0000000..84d4d36
--- /dev/null
+++ b/libde265/util.h
@@ -0,0 +1,229 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DE265_UTIL_H
+#define DE265_UTIL_H
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef _MSC_VER
+#include <inttypes.h>
+#endif
+
+#include <stdio.h>
+#include <string>
+
+#include "libde265/de265.h"
+
+#ifdef __GNUC__
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#endif
+
+#ifdef _MSC_VER
+#define LIBDE265_DECLARE_ALIGNED( var, n ) __declspec(align(n)) var
+#define likely(x)      (x)
+#define unlikely(x)    (x)
+#else
+#define LIBDE265_DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
+#define likely(x)      __builtin_expect(!!(x), 1)
+#define unlikely(x)    __builtin_expect(!!(x), 0)
+#endif
+
+#if defined(__GNUC__) && (__GNUC__ >= 4)
+#define LIBDE265_CHECK_RESULT __attribute__ ((warn_unused_result))
+#elif defined(_MSC_VER) && (_MSC_VER >= 1700)
+#define LIBDE265_CHECK_RESULT _Check_return_
+#else
+#define LIBDE265_CHECK_RESULT
+#endif
+
+// Be careful with these alignment instructions. They only specify the alignment within
+// a struct. But they cannot make sure that the base address of the struct has the same alignment
+// when it is dynamically allocated.
+#define ALIGNED_32( var ) LIBDE265_DECLARE_ALIGNED( var, 32 )
+#define ALIGNED_16( var ) LIBDE265_DECLARE_ALIGNED( var, 16 )
+#define ALIGNED_8( var )  LIBDE265_DECLARE_ALIGNED( var, 8 )
+#define ALIGNED_4( var )  LIBDE265_DECLARE_ALIGNED( var, 4 )
+
+// C++11 specific features
+#if defined(_MSC_VER) || (!__clang__ && __GNUC__ && GCC_VERSION < 40600)
+#define FOR_LOOP(type, var, list)   for each (type var in list)
+#undef FOR_LOOP_AUTO_SUPPORT
+#else
+#define FOR_LOOP(type, var, list)   for (type var : list)
+#define FOR_LOOP_AUTO_SUPPORT 1
+#endif
+
+#ifdef USE_STD_TR1_NAMESPACE
+#include <tr1/memory>
+namespace std { using namespace std::tr1; }
+#endif
+
+#ifdef NEED_STD_MOVE_FALLBACK
+// Provide fallback variant of "std::move" for older compilers with
+// incomplete/broken C++11 support.
+namespace std {
+
+template<typename _Tp>
+inline typename std::remove_reference<_Tp>::type&& move(_Tp&& __t) {
+  return static_cast<typename std::remove_reference<_Tp>::type&&>(__t);
+}
+
+}  // namespace std
+#endif
+
+#ifdef NEED_NULLPTR_FALLBACK
+// Compilers with partial/incomplete support for C++11 don't know about
+// "nullptr". A simple alias should be fine for our use case.
+#define nullptr NULL
+#endif
+
+#ifdef _MSC_VER
+  #ifdef _CPPRTTI
+  #define RTTI_ENABLED
+  #endif
+#else
+  #ifdef __GXX_RTTI
+  #define RTTI_ENABLED
+  #endif
+#endif
+
+//inline uint8_t Clip1_8bit(int16_t value) { if (value<=0) return 0; else if (value>=255) return 255; else return value; }
+#define Clip1_8bit(value) ((value)<0 ? 0 : (value)>255 ? 255 : (value))
+#define Clip_BitDepth(value, bit_depth) ((value)<0 ? 0 : (value)>((1<<(bit_depth))-1) ? ((1<<(bit_depth))-1) : (value))
+#define Clip3(low,high,value) ((value)<(low) ? (low) : (value)>(high) ? (high) : (value))
+#define Sign(value) (((value)<0) ? -1 : ((value)>0) ? 1 : 0)
+#define abs_value(a) (((a)<0) ? -(a) : (a))
+#define libde265_min(a,b) (((a)<(b)) ? (a) : (b))
+#define libde265_max(a,b) (((a)>(b)) ? (a) : (b))
+
+LIBDE265_INLINE static int ceil_div(int num,int denom)
+{
+  num += denom-1; // round up; intended for non-negative num and positive denom
+  return num/denom;
+}
+
+LIBDE265_INLINE static int ceil_log2(int val)
+{
+  int n=0;
+  while (val > (1<<n)) { // smallest n with (1<<n) >= val
+    n++;
+  }
+
+  return n;
+}
+
+LIBDE265_INLINE static int Log2(int v)
+{
+  int n=0;
+  while (v>1) { // counts halvings until v<=1, i.e. floor(log2(v)) for v>=1
+    n++;
+    v>>=1;
+  }
+
+  return n;
+}
+
+LIBDE265_INLINE static int Log2SizeToArea(int v)
+{
+  return (1<<(v<<1)); // (2^v)^2
+}
+
+void copy_subimage(uint8_t* dst,int dststride,
+                   const uint8_t* src,int srcstride,
+                   int w, int h);
+
+
+// === logging ===
+
+enum LogModule {
+  LogHighlevel,
+  LogHeaders,
+  LogSlice,
+  LogDPB,
+  LogMotion,
+  LogTransform,
+  LogDeblock,
+  LogSAO,
+  LogSEI,
+  LogIntraPred,
+  LogPixels,
+  LogSymbols,
+  LogCABAC,
+  LogEncoder,
+  LogEncoderMetadata,
+  NUMBER_OF_LogModules
+};
+
+
+#if defined(DE265_LOG_ERROR) || defined(DE265_LOG_INFO) || defined(DE265_LOG_DEBUG) || defined(DE265_LOG_TRACE)
+# define DE265_LOGGING 1
+void enable_logging(enum LogModule);
+void disable_logging(enum LogModule);
+#else
+#define enable_logging(x) { }
+#define disable_logging(x) { }
+#endif
+
+#ifdef DE265_LOGGING
+void log_set_current_POC(int poc);
+#else
+#define log_set_current_POC(poc) { }
+#endif
+
+#ifdef DE265_LOG_ERROR
+void logerror(enum LogModule module, const char* string, ...);
+#else
+#define logerror(a,b, ...) { }
+#endif
+
+#ifdef DE265_LOG_INFO
+void loginfo (enum LogModule module, const char* string, ...);
+#else
+#define loginfo(a,b, ...) { }
+#endif
+
+#ifdef DE265_LOG_DEBUG
+void logdebug(enum LogModule module, const char* string, ...);
+bool logdebug_enabled(enum LogModule module);
+#else
+#define logdebug(a,b, ...) { }
+inline bool logdebug_enabled(enum LogModule module) { return false; }
+#endif
+
+#ifdef DE265_LOG_TRACE
+void logtrace(enum LogModule module, const char* string, ...);
+#else
+#define logtrace(a,b, ...)
{ } +#endif + +void log2fh(FILE* fh, const char* string, ...); + + +void printBlk(const char* title,const int32_t* data, int blksize, int stride, const std::string& prefix=" "); +void printBlk(const char* title,const int16_t* data, int blksize, int stride, const std::string& prefix=" "); +void printBlk(const char* title,const uint8_t* data, int blksize, int stride, const std::string& prefix=" "); + +void debug_set_image_output(void (*)(const struct de265_image*, int slot)); +void debug_show_image(const struct de265_image*, int slot); + +#endif diff --git a/libde265/visualize.h b/libde265/visualize.h new file mode 100644 index 0000000..2cc0a5c --- /dev/null +++ b/libde265/visualize.h @@ -0,0 +1,50 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef DE265_VISUALIZE_H +#define DE265_VISUALIZE_H + +#include "libde265/de265.h" +#include "libde265/image.h" + + +void write_picture_to_file(const de265_image* img, const char* filename); + +#ifdef __cplusplus +extern "C" { +#endif + +// TODO: these should either move to "sherlock265", or be part of the +// "official" public API +LIBDE265_API void draw_CB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t value, int pixelSize); +LIBDE265_API void draw_TB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t value, int pixelSize); +LIBDE265_API void draw_PB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t value, int pixelSize); +LIBDE265_API void draw_PB_pred_modes(const de265_image* img, uint8_t* dst, int stride, int pixelSize); +LIBDE265_API void draw_intra_pred_modes(const de265_image* img, uint8_t* dst, int stride, uint32_t value, int pixelSize); +LIBDE265_API void draw_QuantPY(const de265_image* img, uint8_t* dst, int stride, int pixelSize); +LIBDE265_API void draw_Motion(const de265_image* img, uint8_t* dst, int stride, int pixelSize); +LIBDE265_API void draw_Slices(const de265_image* img, uint8_t* dst, int stride, int pixelSize); +LIBDE265_API void draw_Tiles(const de265_image* img, uint8_t* dst, int stride, int pixelSize); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libde265/vps.h b/libde265/vps.h new file mode 100644 index 0000000..04c9c15 --- /dev/null +++ b/libde265/vps.h @@ -0,0 +1,173 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_VPS_H +#define DE265_VPS_H + +#ifdef HAVE_CONFIG_H +#include +#endif + +#ifdef HAVE_STDBOOL_H +#include +#endif + +#include "libde265/bitstream.h" +#include "libde265/de265.h" +#include "libde265/cabac.h" + +#include + +class error_queue; + +#define MAX_TEMPORAL_SUBLAYERS 8 + + +enum profile_idc { + Profile_Main = 1, + Profile_Main10 = 2, + Profile_MainStillPicture = 3, + Profile_FormatRangeExtensions = 4 +}; + + +class profile_data { +public: + void read(bitreader* reader); + void write(CABAC_encoder& writer) const; + void dump(bool general, FILE* fh) const; + + void set_defaults(enum profile_idc, int level_major, int level_minor); + + // --- profile --- + + char profile_present_flag; // always true for general profile + + char profile_space; // currently always 0 + char tier_flag; // main tier or low tier (see Table A-66/A-67) + enum profile_idc profile_idc; // profile + + char profile_compatibility_flag[32]; // to which profile we are compatible + + char progressive_source_flag; + char interlaced_source_flag; + char non_packed_constraint_flag; + char frame_only_constraint_flag; + + + // --- level --- + + char level_present_flag; // always true for general level + int level_idc; // level * 30 +}; + + +class profile_tier_level +{ +public: + void read(bitreader* reader, int max_sub_layers); + void write(CABAC_encoder& writer, int max_sub_layers) const; + void dump(int max_sub_layers, FILE* fh) const; + + profile_data general; + + //bool sub_layer_profile_present[MAX_TEMPORAL_SUBLAYERS]; + //bool sub_layer_level_present[MAX_TEMPORAL_SUBLAYERS]; + + profile_data 
sub_layer[MAX_TEMPORAL_SUBLAYERS]; +}; + + +/* +struct bit_rate_pic_rate_info { + char bit_rate_info_present_flag[8]; + char pic_rate_info_present_flag[8]; + + int avg_bit_rate[8]; + int max_bit_rate[8]; + + char constant_pic_rate_idc[8]; + int avg_pic_rate[8]; + +}; + +void read_bit_rate_pic_rate_info(bitreader* reader, + struct bit_rate_pic_rate_info* hdr, + int TempLevelLow, + int TempLevelHigh); + +void dump_bit_rate_pic_rate_info(struct bit_rate_pic_rate_info* hdr, + int TempLevelLow, + int TempLevelHigh); +*/ + + +typedef struct { + int vps_max_dec_pic_buffering; // [1 ; ] + int vps_max_num_reorder_pics; // [0 ; ] + int vps_max_latency_increase; // 0 -> no limit, otherwise value is (x-1) +} layer_data; + + +class video_parameter_set +{ +public: + de265_error read(error_queue* errqueue, bitreader* reader); + de265_error write(error_queue* errqueue, CABAC_encoder& out) const; + void dump(int fd) const; + + void set_defaults(enum profile_idc profile, int level_major, int level_minor); + + int video_parameter_set_id; + int vps_max_layers; // [1;?] 
currently always 1 + int vps_max_sub_layers; // [1;7] number of temporal sub-layers + int vps_temporal_id_nesting_flag; // indicate temporal up-switching always possible + profile_tier_level profile_tier_level_; + + int vps_sub_layer_ordering_info_present_flag; + layer_data layer[MAX_TEMPORAL_SUBLAYERS]; + + uint8_t vps_max_layer_id; // max value for nuh_layer_id in NALs + int vps_num_layer_sets; // [1;1024], currently always 1 + + std::vector > layer_id_included_flag; // max size = [1024][64] + + + // --- timing info --- + + char vps_timing_info_present_flag; + uint32_t vps_num_units_in_tick; + uint32_t vps_time_scale; + char vps_poc_proportional_to_timing_flag; + uint32_t vps_num_ticks_poc_diff_one; + + int vps_num_hrd_parameters; // currently [0;1] + + std::vector hrd_layer_set_idx; // max size = 1024 + std::vector cprms_present_flag; // max size = 1024 + + + // --- vps extension --- + + char vps_extension_flag; +}; + + +#endif diff --git a/libde265/vui.h b/libde265/vui.h new file mode 100644 index 0000000..c412669 --- /dev/null +++ b/libde265/vui.h @@ -0,0 +1,126 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef DE265_VUI_H +#define DE265_VUI_H + +#include "libde265/de265.h" +#include "libde265/bitstream.h" + +#include + +class error_queue; +class seq_parameter_set; + + +enum VideoFormat { + VideoFormat_Component = 0, + VideoFormat_PAL = 1, + VideoFormat_NTSC = 2, + VideoFormat_SECAM = 3, + VideoFormat_MAC = 4, + VideoFormat_Unspecified = 5 +}; + +const char* get_video_format_name(enum VideoFormat); + + +class video_usability_information +{ + public: + video_usability_information(); + + de265_error read(error_queue*, bitreader*, const seq_parameter_set*); + void dump(int fd) const; + + + // --- sample aspect ratio (SAR) --- + + bool aspect_ratio_info_present_flag; + uint16_t sar_width; // sar_width and sar_height are zero if unspecified + uint16_t sar_height; + + + // --- overscan --- + + bool overscan_info_present_flag; + bool overscan_appropriate_flag; + + + // --- video signal type --- + + bool video_signal_type_present_flag; + enum VideoFormat video_format; + bool video_full_range_flag; + bool colour_description_present_flag; + uint8_t colour_primaries; + uint8_t transfer_characteristics; + uint8_t matrix_coeffs; + + // --- chroma / interlaced --- + + bool chroma_loc_info_present_flag; + uint8_t chroma_sample_loc_type_top_field; + uint8_t chroma_sample_loc_type_bottom_field; + + bool neutral_chroma_indication_flag; + bool field_seq_flag; + bool frame_field_info_present_flag; + + // --- default display window --- + + bool default_display_window_flag; + uint32_t def_disp_win_left_offset; + uint32_t def_disp_win_right_offset; + uint32_t def_disp_win_top_offset; + uint32_t def_disp_win_bottom_offset; + + + // --- timing --- + + bool vui_timing_info_present_flag; + uint32_t vui_num_units_in_tick; + uint32_t vui_time_scale; + + bool vui_poc_proportional_to_timing_flag; + uint32_t vui_num_ticks_poc_diff_one; + + + // --- hrd parameters --- + + bool vui_hrd_parameters_present_flag; + //hrd_parameters vui_hrd_parameters; + + + // --- bitstream restriction --- + 
+ bool bitstream_restriction_flag; + bool tiles_fixed_structure_flag; + bool motion_vectors_over_pic_boundaries_flag; + bool restricted_ref_pic_lists_flag; + uint16_t min_spatial_segmentation_idc; + uint8_t max_bytes_per_pic_denom; + uint8_t max_bits_per_min_cu_denom; + uint8_t log2_max_mv_length_horizontal; + uint8_t log2_max_mv_length_vertical; +}; + + +#endif diff --git a/md5.cc b/md5.cc new file mode 100644 index 0000000..2f01c93 --- /dev/null +++ b/md5.cc @@ -0,0 +1,295 @@ +/* + * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc. + * MD5 Message-Digest Algorithm (RFC 1321). + * + * Homepage: + * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 + * + * Author: + * Alexander Peslyak, better known as Solar Designer + * + * This software was written by Alexander Peslyak in 2001. No copyright is + * claimed, and the software is hereby placed in the public domain. + * In case this attempt to disclaim copyright and place the software in the + * public domain is deemed null and void, then the software is + * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the + * general public under the following terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * There's ABSOLUTELY NO WARRANTY, express or implied. + * + * (This is a heavily cut-down "BSD license".) + * + * This differs from Colin Plumb's older public domain implementation in that + * no exactly 32-bit integer data type is required (any 32-bit or wider + * unsigned integer data type will do), there's no compile-time endianness + * configuration, and the function prototypes match OpenSSL's. No code from + * Colin Plumb's implementation has been reused; this comment merely compares + * the properties of the two independent implementations. + * + * The primary goals of this implementation are portability and ease of use. + * It is meant to be fast, but not as fast as possible. 
Some known + * optimizations are not included to reduce source code size and avoid + * compile-time configuration. + */ + +#ifndef HAVE_OPENSSL + +#include + +#include "md5.h" + +/* + * The basic MD5 functions. + * + * F and G are optimized compared to their RFC 1321 definitions for + * architectures that lack an AND-NOT instruction, just like in Colin Plumb's + * implementation. + */ +#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y)))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | ~(z))) + +/* + * The MD5 transformation for all four rounds. + */ +#define STEP(f, a, b, c, d, x, t, s) \ + (a) += f((b), (c), (d)) + (x) + (t); \ + (a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \ + (a) += (b); + +/* + * SET reads 4 input bytes in little-endian byte order and stores them + * in a properly aligned word in host byte order. + * + * The check for little-endian architectures that tolerate unaligned + * memory accesses is just an optimization. Nothing will break if it + * doesn't work. + */ +#if defined(__i386__) || defined(__x86_64__) || defined(__vax__) +#define SET(n) \ + (*(MD5_u32plus *)&ptr[(n) * 4]) +#define GET(n) \ + SET(n) +#else +#define SET(n) \ + (ctx->block[(n)] = \ + (MD5_u32plus)ptr[(n) * 4] | \ + ((MD5_u32plus)ptr[(n) * 4 + 1] << 8) | \ + ((MD5_u32plus)ptr[(n) * 4 + 2] << 16) | \ + ((MD5_u32plus)ptr[(n) * 4 + 3] << 24)) +#define GET(n) \ + (ctx->block[(n)]) +#endif + +/* + * This processes one or more 64-byte data blocks, but does NOT update + * the bit counters. There are no alignment requirements. 
+ */ +static void *body(MD5_CTX *ctx, void *data, unsigned long size) +{ + unsigned char *ptr; + MD5_u32plus a, b, c, d; + MD5_u32plus saved_a, saved_b, saved_c, saved_d; + + ptr = (unsigned char *)data; + + a = ctx->a; + b = ctx->b; + c = ctx->c; + d = ctx->d; + + do { + saved_a = a; + saved_b = b; + saved_c = c; + saved_d = d; + +/* Round 1 */ + STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7) + STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12) + STEP(F, c, d, a, b, SET(2), 0x242070db, 17) + STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22) + STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7) + STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12) + STEP(F, c, d, a, b, SET(6), 0xa8304613, 17) + STEP(F, b, c, d, a, SET(7), 0xfd469501, 22) + STEP(F, a, b, c, d, SET(8), 0x698098d8, 7) + STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12) + STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17) + STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22) + STEP(F, a, b, c, d, SET(12), 0x6b901122, 7) + STEP(F, d, a, b, c, SET(13), 0xfd987193, 12) + STEP(F, c, d, a, b, SET(14), 0xa679438e, 17) + STEP(F, b, c, d, a, SET(15), 0x49b40821, 22) + +/* Round 2 */ + STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5) + STEP(G, d, a, b, c, GET(6), 0xc040b340, 9) + STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14) + STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20) + STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5) + STEP(G, d, a, b, c, GET(10), 0x02441453, 9) + STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14) + STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20) + STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5) + STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9) + STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14) + STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20) + STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5) + STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9) + STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14) + STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20) + +/* Round 3 */ + STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4) + STEP(H, d, a, b, c, GET(8), 0x8771f681, 11) + STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16) + 
STEP(H, b, c, d, a, GET(14), 0xfde5380c, 23) + STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4) + STEP(H, d, a, b, c, GET(4), 0x4bdecfa9, 11) + STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16) + STEP(H, b, c, d, a, GET(10), 0xbebfbc70, 23) + STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4) + STEP(H, d, a, b, c, GET(0), 0xeaa127fa, 11) + STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16) + STEP(H, b, c, d, a, GET(6), 0x04881d05, 23) + STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4) + STEP(H, d, a, b, c, GET(12), 0xe6db99e5, 11) + STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16) + STEP(H, b, c, d, a, GET(2), 0xc4ac5665, 23) + +/* Round 4 */ + STEP(I, a, b, c, d, GET(0), 0xf4292244, 6) + STEP(I, d, a, b, c, GET(7), 0x432aff97, 10) + STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15) + STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21) + STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6) + STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10) + STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15) + STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21) + STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6) + STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10) + STEP(I, c, d, a, b, GET(6), 0xa3014314, 15) + STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21) + STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6) + STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10) + STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15) + STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21) + + a += saved_a; + b += saved_b; + c += saved_c; + d += saved_d; + + ptr += 64; + } while (size -= 64); + + ctx->a = a; + ctx->b = b; + ctx->c = c; + ctx->d = d; + + return ptr; +} + +void MD5_Init(MD5_CTX *ctx) +{ + ctx->a = 0x67452301; + ctx->b = 0xefcdab89; + ctx->c = 0x98badcfe; + ctx->d = 0x10325476; + + ctx->lo = 0; + ctx->hi = 0; +} + +void MD5_Update(MD5_CTX *ctx, void *data, unsigned long size) +{ + MD5_u32plus saved_lo; + unsigned long used, free; + + saved_lo = ctx->lo; + if ((ctx->lo = (saved_lo + size) & 0x1fffffff) < saved_lo) + ctx->hi++; + ctx->hi += size >> 29; + + used = saved_lo & 0x3f; + + if (used) { + free = 64 - 
used; + + if (size < free) { + memcpy(&ctx->buffer[used], data, size); + return; + } + + memcpy(&ctx->buffer[used], data, free); + data = (unsigned char *)data + free; + size -= free; + body(ctx, ctx->buffer, 64); + } + + if (size >= 64) { + data = body(ctx, data, size & ~(unsigned long)0x3f); + size &= 0x3f; + } + + memcpy(ctx->buffer, data, size); +} + +void MD5_Final(unsigned char *result, MD5_CTX *ctx) +{ + unsigned long used, free; + + used = ctx->lo & 0x3f; + + ctx->buffer[used++] = 0x80; + + free = 64 - used; + + if (free < 8) { + memset(&ctx->buffer[used], 0, free); + body(ctx, ctx->buffer, 64); + used = 0; + free = 64; + } + + memset(&ctx->buffer[used], 0, free - 8); + + ctx->lo <<= 3; + ctx->buffer[56] = ctx->lo; + ctx->buffer[57] = ctx->lo >> 8; + ctx->buffer[58] = ctx->lo >> 16; + ctx->buffer[59] = ctx->lo >> 24; + ctx->buffer[60] = ctx->hi; + ctx->buffer[61] = ctx->hi >> 8; + ctx->buffer[62] = ctx->hi >> 16; + ctx->buffer[63] = ctx->hi >> 24; + + body(ctx, ctx->buffer, 64); + + result[0] = ctx->a; + result[1] = ctx->a >> 8; + result[2] = ctx->a >> 16; + result[3] = ctx->a >> 24; + result[4] = ctx->b; + result[5] = ctx->b >> 8; + result[6] = ctx->b >> 16; + result[7] = ctx->b >> 24; + result[8] = ctx->c; + result[9] = ctx->c >> 8; + result[10] = ctx->c >> 16; + result[11] = ctx->c >> 24; + result[12] = ctx->d; + result[13] = ctx->d >> 8; + result[14] = ctx->d >> 16; + result[15] = ctx->d >> 24; + + memset(ctx, 0, sizeof(*ctx)); +} + +#endif diff --git a/motion.cc b/motion.cc new file mode 100644 index 0000000..deae240 --- /dev/null +++ b/motion.cc @@ -0,0 +1,2111 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "motion.h" +#include "decctx.h" +#include "util.h" +#include "dpb.h" + +#include + + +#include +#include +#include + +#if defined(_MSC_VER) || defined(__MINGW32__) +# include +#elif defined(HAVE_ALLOCA_H) +# include +#endif + + +#define MAX_CU_SIZE 64 + + +static int extra_before[4] = { 0,3,3,2 }; +static int extra_after [4] = { 0,3,4,4 }; + + + +template +void mc_luma(const base_context* ctx, + const seq_parameter_set* sps, int mv_x, int mv_y, + int xP,int yP, + int16_t* out, int out_stride, + const pixel_t* ref, int ref_stride, + int nPbW, int nPbH, int bitDepth_L) +{ + int xFracL = mv_x & 3; + int yFracL = mv_y & 3; + + int xIntOffsL = xP + (mv_x>>2); + int yIntOffsL = yP + (mv_y>>2); + + // luma sample interpolation process (8.5.3.2.2.1) + + //const int shift1 = sps->BitDepth_Y-8; + //const int shift2 = 6; + const int shift3 = 14 - sps->BitDepth_Y; + + int w = sps->pic_width_in_luma_samples; + int h = sps->pic_height_in_luma_samples; + + ALIGNED_16(int16_t) mcbuffer[MAX_CU_SIZE * (MAX_CU_SIZE+7)]; + + if (xFracL==0 && yFracL==0) { + + if (xIntOffsL >= 0 && yIntOffsL >= 0 && + nPbW+xIntOffsL <= w && nPbH+yIntOffsL <= h) { + + ctx->acceleration.put_hevc_qpel(out, out_stride, + &ref[yIntOffsL*ref_stride + xIntOffsL], + ref_stride /* sizeof(pixel_t)*/, + nPbW,nPbH, mcbuffer, 0,0, bitDepth_L); + } + else { + for (int y=0;y \n"); + + for (int y=0;y> 6); // 6 will be used when summing predictions + } + logtrace(LogMotion,"\n"); + } +#endif + } + else { + int extra_left = extra_before[xFracL]; + int extra_right = extra_after [xFracL]; + int extra_top = extra_before[yFracL]; + int 
extra_bottom = extra_after [yFracL]; + + //int nPbW_extra = extra_left + nPbW + extra_right; + //int nPbH_extra = extra_top + nPbH + extra_bottom; + + + pixel_t padbuf[(MAX_CU_SIZE+16)*(MAX_CU_SIZE+7)]; + + const pixel_t* src_ptr; + int src_stride; + + if (-extra_left + xIntOffsL >= 0 && + -extra_top + yIntOffsL >= 0 && + nPbW+extra_right + xIntOffsL < w && + nPbH+extra_bottom + yIntOffsL < h) { + src_ptr = &ref[xIntOffsL + yIntOffsL*ref_stride]; + src_stride = ref_stride; + } + else { + for (int y=-extra_top;yacceleration.put_hevc_qpel(out, out_stride, + src_ptr, src_stride /* sizeof(pixel_t) */, + nPbW,nPbH, mcbuffer, xFracL,yFracL, bitDepth_L); + + + logtrace(LogMotion,"---V---\n"); + for (int y=0;y +void mc_chroma(const base_context* ctx, + const seq_parameter_set* sps, + int mv_x, int mv_y, + int xP,int yP, + int16_t* out, int out_stride, + const pixel_t* ref, int ref_stride, + int nPbWC, int nPbHC, int bit_depth_C) +{ + // chroma sample interpolation process (8.5.3.2.2.2) + + //const int shift1 = sps->BitDepth_C-8; + //const int shift2 = 6; + const int shift3 = 14 - sps->BitDepth_C; + + int wC = sps->pic_width_in_luma_samples /sps->SubWidthC; + int hC = sps->pic_height_in_luma_samples/sps->SubHeightC; + + mv_x *= 2 / sps->SubWidthC; + mv_y *= 2 / sps->SubHeightC; + + int xFracC = mv_x & 7; + int yFracC = mv_y & 7; + + int xIntOffsC = xP/sps->SubWidthC + (mv_x>>3); + int yIntOffsC = yP/sps->SubHeightC + (mv_y>>3); + + ALIGNED_32(int16_t mcbuffer[MAX_CU_SIZE*(MAX_CU_SIZE+7)]); + + if (xFracC == 0 && yFracC == 0) { + if (xIntOffsC>=0 && nPbWC+xIntOffsC<=wC && + yIntOffsC>=0 && nPbHC+yIntOffsC<=hC) { + ctx->acceleration.put_hevc_epel(out, out_stride, + &ref[xIntOffsC + yIntOffsC*ref_stride], ref_stride, + nPbWC,nPbHC, 0,0, NULL, bit_depth_C); + } + else + { + for (int y=0;y=1 && nPbWC+xIntOffsC<=wC-2 && + yIntOffsC>=1 && nPbHC+yIntOffsC<=hC-2) { + src_ptr = &ref[xIntOffsC + yIntOffsC*ref_stride]; + src_stride = ref_stride; + } + else { + for (int 
y=-extra_top;yacceleration.put_hevc_epel_hv(out, out_stride, + src_ptr, src_stride, + nPbWC,nPbHC, xFracC,yFracC, mcbuffer, bit_depth_C); + } + else if (xFracC) { + ctx->acceleration.put_hevc_epel_h(out, out_stride, + src_ptr, src_stride, + nPbWC,nPbHC, xFracC,yFracC, mcbuffer, bit_depth_C); + } + else if (yFracC) { + ctx->acceleration.put_hevc_epel_v(out, out_stride, + src_ptr, src_stride, + nPbWC,nPbHC, xFracC,yFracC, mcbuffer, bit_depth_C); + } + else { + assert(false); // full-pel shifts are handled above + } + } +} + + + +// 8.5.3.2 +// NOTE: for full-pel shifts, we can introduce a fast path, simply copying without shifts +void generate_inter_prediction_samples(base_context* ctx, + const slice_segment_header* shdr, + de265_image* img, + int xC,int yC, + int xB,int yB, + int nCS, int nPbW,int nPbH, + const PBMotion* vi) +{ + int xP = xC+xB; + int yP = yC+yB; + + void* pixels[3]; + int stride[3]; + + const pic_parameter_set* pps = shdr->pps.get(); + const seq_parameter_set* sps = pps->sps.get(); + + const int SubWidthC = sps->SubWidthC; + const int SubHeightC = sps->SubHeightC; + + pixels[0] = img->get_image_plane_at_pos_any_depth(0,xP,yP); + stride[0] = img->get_image_stride(0); + + pixels[1] = img->get_image_plane_at_pos_any_depth(1,xP/SubWidthC,yP/SubHeightC); + stride[1] = img->get_image_stride(1); + + pixels[2] = img->get_image_plane_at_pos_any_depth(2,xP/SubWidthC,yP/SubHeightC); + stride[2] = img->get_image_stride(2); + + + ALIGNED_16(int16_t) predSamplesL [2 /* LX */][MAX_CU_SIZE* MAX_CU_SIZE]; + ALIGNED_16(int16_t) predSamplesC[2 /* chroma */ ][2 /* LX */][MAX_CU_SIZE* MAX_CU_SIZE]; + + //int xP = xC+xB; + //int yP = yC+yB; + + int predFlag[2]; + predFlag[0] = vi->predFlag[0]; + predFlag[1] = vi->predFlag[1]; + + const int bit_depth_L = sps->BitDepth_Y; + const int bit_depth_C = sps->BitDepth_C; + + // Some encoders use bi-prediction with two similar MVs. + // Identify this case and use only one MV. 
+ + // do this only without weighted prediction, because the weights/offsets may be different + if (pps->weighted_pred_flag==0) { + if (predFlag[0] && predFlag[1]) { + if (vi->mv[0].x == vi->mv[1].x && + vi->mv[0].y == vi->mv[1].y && + shdr->RefPicList[0][vi->refIdx[0]] == + shdr->RefPicList[1][vi->refIdx[1]]) { + predFlag[1] = 0; + } + } + } + + + for (int l=0;l<2;l++) { + if (predFlag[l]) { + // 8.5.3.2.1 + + if (vi->refIdx[l] >= MAX_NUM_REF_PICS) { + img->integrity = INTEGRITY_DECODING_ERRORS; + ctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false); + return; + } + + const de265_image* refPic = ctx->get_image(shdr->RefPicList[l][vi->refIdx[l]]); + + logtrace(LogMotion, "refIdx: %d -> dpb[%d]\n", vi->refIdx[l], shdr->RefPicList[l][vi->refIdx[l]]); + + if (!refPic || refPic->PicState == UnusedForReference) { + img->integrity = INTEGRITY_DECODING_ERRORS; + ctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false); + + // TODO: fill predSamplesC with black or grey + } + else { + // 8.5.3.2.2 + + logtrace(LogMotion,"do MC: L%d,MV=%d;%d RefPOC=%d\n", + l,vi->mv[l].x,vi->mv[l].y,refPic->PicOrderCntVal); + + + // TODO: must predSamples stride really be nCS or can it be somthing smaller like nPbW? 
+ + if (img->high_bit_depth(0)) { + mc_luma(ctx, sps, vi->mv[l].x, vi->mv[l].y, xP,yP, + predSamplesL[l],nCS, + (const uint16_t*)refPic->get_image_plane(0), + refPic->get_luma_stride(), nPbW,nPbH, bit_depth_L); + } + else { + mc_luma(ctx, sps, vi->mv[l].x, vi->mv[l].y, xP,yP, + predSamplesL[l],nCS, + (const uint8_t*)refPic->get_image_plane(0), + refPic->get_luma_stride(), nPbW,nPbH, bit_depth_L); + } + + if (img->high_bit_depth(0)) { + mc_chroma(ctx, sps, vi->mv[l].x, vi->mv[l].y, xP,yP, + predSamplesC[0][l],nCS, (const uint16_t*)refPic->get_image_plane(1), + refPic->get_chroma_stride(), nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C); + mc_chroma(ctx, sps, vi->mv[l].x, vi->mv[l].y, xP,yP, + predSamplesC[1][l],nCS, (const uint16_t*)refPic->get_image_plane(2), + refPic->get_chroma_stride(), nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C); + } + else { + mc_chroma(ctx, sps, vi->mv[l].x, vi->mv[l].y, xP,yP, + predSamplesC[0][l],nCS, (const uint8_t*)refPic->get_image_plane(1), + refPic->get_chroma_stride(), nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C); + mc_chroma(ctx, sps, vi->mv[l].x, vi->mv[l].y, xP,yP, + predSamplesC[1][l],nCS, (const uint8_t*)refPic->get_image_plane(2), + refPic->get_chroma_stride(), nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C); + } + } + } + } + + + // weighted sample prediction (8.5.3.2.3) + + const int shift1_L = libde265_max(2,14-sps->BitDepth_Y); + const int offset_shift1_L = img->get_sps().WpOffsetBdShiftY; + const int shift1_C = libde265_max(2,14-sps->BitDepth_C); + const int offset_shift1_C = img->get_sps().WpOffsetBdShiftC; + + /* + const int shift1_L = 14-img->sps.BitDepth_Y; + const int offset_shift1_L = img->sps.BitDepth_Y-8; + const int shift1_C = 14-img->sps.BitDepth_C; + const int offset_shift1_C = img->sps.BitDepth_C-8; + */ + + /* + if (0) + printf("%d/%d %d/%d %d/%d %d/%d\n", + shift1_L, + Nshift1_L, + offset_shift1_L, + Noffset_shift1_L, + shift1_C, + Nshift1_C, + offset_shift1_C, + Noffset_shift1_C); + + assert(shift1_L== + 
Nshift1_L); + assert(offset_shift1_L== + Noffset_shift1_L); + assert(shift1_C== + Nshift1_C); + assert(offset_shift1_C== + Noffset_shift1_C); + */ + + + logtrace(LogMotion,"predFlags (modified): %d %d\n", predFlag[0], predFlag[1]); + + if (shdr->slice_type == SLICE_TYPE_P) { + if (pps->weighted_pred_flag==0) { + if (predFlag[0]==1 && predFlag[1]==0) { + ctx->acceleration.put_unweighted_pred(pixels[0], stride[0], + predSamplesL[0],nCS, nPbW,nPbH, bit_depth_L); + ctx->acceleration.put_unweighted_pred(pixels[1], stride[1], + predSamplesC[0][0],nCS, + nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C); + ctx->acceleration.put_unweighted_pred(pixels[2], stride[2], + predSamplesC[1][0],nCS, + nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C); + } + else { + ctx->add_warning(DE265_WARNING_BOTH_PREDFLAGS_ZERO, false); + img->integrity = INTEGRITY_DECODING_ERRORS; + } + } + else { + // weighted prediction + + if (predFlag[0]==1 && predFlag[1]==0) { + + int refIdx0 = vi->refIdx[0]; + + int luma_log2WD = shdr->luma_log2_weight_denom + shift1_L; + int chroma_log2WD = shdr->ChromaLog2WeightDenom + shift1_C; + + int luma_w0 = shdr->LumaWeight[0][refIdx0]; + int luma_o0 = shdr->luma_offset[0][refIdx0] * (1<<(offset_shift1_L)); + + int chroma0_w0 = shdr->ChromaWeight[0][refIdx0][0]; + int chroma0_o0 = shdr->ChromaOffset[0][refIdx0][0] * (1<<(offset_shift1_C)); + int chroma1_w0 = shdr->ChromaWeight[0][refIdx0][1]; + int chroma1_o0 = shdr->ChromaOffset[0][refIdx0][1] * (1<<(offset_shift1_C)); + + logtrace(LogMotion,"weighted-0 [%d] %d %d %d %dx%d\n", refIdx0, luma_log2WD-6,luma_w0,luma_o0,nPbW,nPbH); + + ctx->acceleration.put_weighted_pred(pixels[0], stride[0], + predSamplesL[0],nCS, nPbW,nPbH, + luma_w0, luma_o0, luma_log2WD, bit_depth_L); + ctx->acceleration.put_weighted_pred(pixels[1], stride[1], + predSamplesC[0][0],nCS, nPbW/SubWidthC,nPbH/SubHeightC, + chroma0_w0, chroma0_o0, chroma_log2WD, bit_depth_C); + ctx->acceleration.put_weighted_pred(pixels[2], stride[2], + 
predSamplesC[1][0],nCS, nPbW/SubWidthC,nPbH/SubHeightC, + chroma1_w0, chroma1_o0, chroma_log2WD, bit_depth_C); + } + else { + ctx->add_warning(DE265_WARNING_BOTH_PREDFLAGS_ZERO, false); + img->integrity = INTEGRITY_DECODING_ERRORS; + } + } + } + else { + assert(shdr->slice_type == SLICE_TYPE_B); + + if (predFlag[0]==1 && predFlag[1]==1) { + if (pps->weighted_bipred_flag==0) { + //const int shift2 = 15-8; // TODO: real bit depth + //const int offset2 = 1<<(shift2-1); + + int16_t* in0 = predSamplesL[0]; + int16_t* in1 = predSamplesL[1]; + + ctx->acceleration.put_weighted_pred_avg(pixels[0], stride[0], + in0,in1, nCS, nPbW, nPbH, bit_depth_L); + + int16_t* in00 = predSamplesC[0][0]; + int16_t* in01 = predSamplesC[0][1]; + int16_t* in10 = predSamplesC[1][0]; + int16_t* in11 = predSamplesC[1][1]; + + ctx->acceleration.put_weighted_pred_avg(pixels[1], stride[1], + in00,in01, nCS, + nPbW/SubWidthC, nPbH/SubHeightC, bit_depth_C); + ctx->acceleration.put_weighted_pred_avg(pixels[2], stride[2], + in10,in11, nCS, + nPbW/SubWidthC, nPbH/SubHeightC, bit_depth_C); + } + else { + // weighted prediction + + int refIdx0 = vi->refIdx[0]; + int refIdx1 = vi->refIdx[1]; + + int luma_log2WD = shdr->luma_log2_weight_denom + shift1_L; + int chroma_log2WD = shdr->ChromaLog2WeightDenom + shift1_C; + + int luma_w0 = shdr->LumaWeight[0][refIdx0]; + int luma_o0 = shdr->luma_offset[0][refIdx0] * (1<<(offset_shift1_L)); + int luma_w1 = shdr->LumaWeight[1][refIdx1]; + int luma_o1 = shdr->luma_offset[1][refIdx1] * (1<<(offset_shift1_L)); + + int chroma0_w0 = shdr->ChromaWeight[0][refIdx0][0]; + int chroma0_o0 = shdr->ChromaOffset[0][refIdx0][0] * (1<<(offset_shift1_C)); + int chroma1_w0 = shdr->ChromaWeight[0][refIdx0][1]; + int chroma1_o0 = shdr->ChromaOffset[0][refIdx0][1] * (1<<(offset_shift1_C)); + int chroma0_w1 = shdr->ChromaWeight[1][refIdx1][0]; + int chroma0_o1 = shdr->ChromaOffset[1][refIdx1][0] * (1<<(offset_shift1_C)); + int chroma1_w1 = shdr->ChromaWeight[1][refIdx1][1]; + int 
chroma1_o1 = shdr->ChromaOffset[1][refIdx1][1] * (1<<(offset_shift1_C)); + + logtrace(LogMotion,"weighted-BI-0 [%d] %d %d %d %dx%d\n", refIdx0, luma_log2WD-6,luma_w0,luma_o0,nPbW,nPbH); + logtrace(LogMotion,"weighted-BI-1 [%d] %d %d %d %dx%d\n", refIdx1, luma_log2WD-6,luma_w1,luma_o1,nPbW,nPbH); + + int16_t* in0 = predSamplesL[0]; + int16_t* in1 = predSamplesL[1]; + + ctx->acceleration.put_weighted_bipred(pixels[0], stride[0], + in0,in1, nCS, nPbW, nPbH, + luma_w0,luma_o0, + luma_w1,luma_o1, + luma_log2WD, bit_depth_L); + + int16_t* in00 = predSamplesC[0][0]; + int16_t* in01 = predSamplesC[0][1]; + int16_t* in10 = predSamplesC[1][0]; + int16_t* in11 = predSamplesC[1][1]; + + ctx->acceleration.put_weighted_bipred(pixels[1], stride[1], + in00,in01, nCS, nPbW/SubWidthC, nPbH/SubHeightC, + chroma0_w0,chroma0_o0, + chroma0_w1,chroma0_o1, + chroma_log2WD, bit_depth_C); + ctx->acceleration.put_weighted_bipred(pixels[2], stride[2], + in10,in11, nCS, nPbW/SubWidthC, nPbH/SubHeightC, + chroma1_w0,chroma1_o0, + chroma1_w1,chroma1_o1, + chroma_log2WD, bit_depth_C); + } + } + else if (predFlag[0]==1 || predFlag[1]==1) { + int l = predFlag[0] ? 
0 : 1; + + if (pps->weighted_bipred_flag==0) { + ctx->acceleration.put_unweighted_pred(pixels[0], stride[0], + predSamplesL[l],nCS, nPbW,nPbH, bit_depth_L); + ctx->acceleration.put_unweighted_pred(pixels[1], stride[1], + predSamplesC[0][l],nCS, + nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C); + ctx->acceleration.put_unweighted_pred(pixels[2], stride[2], + predSamplesC[1][l],nCS, + nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C); + } + else { + int refIdx = vi->refIdx[l]; + + int luma_log2WD = shdr->luma_log2_weight_denom + shift1_L; + int chroma_log2WD = shdr->ChromaLog2WeightDenom + shift1_C; + + int luma_w = shdr->LumaWeight[l][refIdx]; + int luma_o = shdr->luma_offset[l][refIdx] * (1<<(offset_shift1_L)); + + int chroma0_w = shdr->ChromaWeight[l][refIdx][0]; + int chroma0_o = shdr->ChromaOffset[l][refIdx][0] * (1<<(offset_shift1_C)); + int chroma1_w = shdr->ChromaWeight[l][refIdx][1]; + int chroma1_o = shdr->ChromaOffset[l][refIdx][1] * (1<<(offset_shift1_C)); + + logtrace(LogMotion,"weighted-B-L%d [%d] %d %d %d %dx%d\n", l, refIdx, luma_log2WD-6,luma_w,luma_o,nPbW,nPbH); + + ctx->acceleration.put_weighted_pred(pixels[0], stride[0], + predSamplesL[l],nCS, nPbW,nPbH, + luma_w, luma_o, luma_log2WD, bit_depth_L); + ctx->acceleration.put_weighted_pred(pixels[1], stride[1], + predSamplesC[0][l],nCS, + nPbW/SubWidthC,nPbH/SubHeightC, + chroma0_w, chroma0_o, chroma_log2WD, bit_depth_C); + ctx->acceleration.put_weighted_pred(pixels[2], stride[2], + predSamplesC[1][l],nCS, + nPbW/SubWidthC,nPbH/SubHeightC, + chroma1_w, chroma1_o, chroma_log2WD, bit_depth_C); + } + } + else { + // TODO: check why it can actually happen that both predFlags[] are false. + // For now, we ignore this and continue decoding. 
+ + ctx->add_warning(DE265_WARNING_BOTH_PREDFLAGS_ZERO, false); + img->integrity = INTEGRITY_DECODING_ERRORS; + } + } + +#if defined(DE265_LOG_TRACE) && 0 + logtrace(LogTransform,"MC pixels (luma), position %d %d:\n", xP,yP); + + for (int y=0;yget_PartMode(x,y); } + const PBMotion& get_mv_info(int x,int y) const override { return img->get_mv_info(x,y); } + +private: + const de265_image* img; +}; + + + +/* + +--+ +--+--+ + |B2| |B1|B0| + +--+----------------+--+--+ + | | + | | + | | + | | + | PB | + | | + | | + +--+ | + |A1| | + +--+-------------------+ + |A0| + +--+ +*/ + + +// 8.5.3.1.2 +// TODO: check: can we fill the candidate list directly in this function and omit to copy later +/* + xC/yC: CB position + nCS: CB size (probably modified because of singleMCLFlag) + xP/yP: PB position (absolute) (probably modified because of singleMCLFlag) + singleMCLFlag + nPbW/nPbH: PB size + partIdx + out_cand: merging candidate vectors + + Add these candidates: + - A1 + - B1 (if != A1) + - B0 (if != B1) + - A0 (if != A1) + - B2 (if != A1 and != B1) + + A maximum of 4 candidates are generated. + + Note 1: For a CB splitted into two PBs, it does not make sense to merge the + second part to the parameters of the first part, since then, we could use 2Nx2N + right away. -> Exclude this candidate. 
+*/ +int derive_spatial_merging_candidates(//const de265_image* img, + const MotionVectorAccess& mvaccess, + const de265_image* img, + int xC, int yC, int nCS, int xP, int yP, + uint8_t singleMCLFlag, + int nPbW, int nPbH, + int partIdx, + PBMotion* out_cand, + int maxCandidates) +{ + const pic_parameter_set* pps = &img->get_pps(); + const int log2_parallel_merge_level = pps->log2_parallel_merge_level; + + enum PartMode PartMode = mvaccess.get_PartMode(xC,yC); + + /* + const int A0 = SpatialMergingCandidates::PRED_A0; + const int A1 = SpatialMergingCandidates::PRED_A1; + const int B0 = SpatialMergingCandidates::PRED_B0; + const int B1 = SpatialMergingCandidates::PRED_B1; + const int B2 = SpatialMergingCandidates::PRED_B2; + */ + + // --- A1 --- + + // a pixel within A1 (bottom right of A1) + int xA1 = xP-1; + int yA1 = yP+nPbH-1; + + bool availableA1; + int idxA1; + + int computed_candidates = 0; + + // check if candidate is in same motion-estimation region (MER) -> discard + if ((xP>>log2_parallel_merge_level) == (xA1>>log2_parallel_merge_level) && + (yP>>log2_parallel_merge_level) == (yA1>>log2_parallel_merge_level)) { + availableA1 = false; + logtrace(LogMotion,"spatial merging candidate A1: below parallel merge level\n"); + } + // redundant candidate? 
(Note 1) -> discard + else if (// !singleMCLFlag && automatically true when partIdx==1 + partIdx==1 && + (PartMode==PART_Nx2N || + PartMode==PART_nLx2N || + PartMode==PART_nRx2N)) { + availableA1 = false; + logtrace(LogMotion,"spatial merging candidate A1: second part ignore\n"); + } + // MV available in A1 + else { + availableA1 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA1,yA1); + if (!availableA1) logtrace(LogMotion,"spatial merging candidate A1: unavailable\n"); + } + + if (availableA1) { + idxA1 = computed_candidates++; + out_cand[idxA1] = mvaccess.get_mv_info(xA1,yA1); + + logtrace(LogMotion,"spatial merging candidate A1:\n"); + logmvcand(out_cand[idxA1]); + } + + if (computed_candidates>=maxCandidates) return computed_candidates; + + + // --- B1 --- + + int xB1 = xP+nPbW-1; + int yB1 = yP-1; + + bool availableB1; + int idxB1; + + // same MER -> discard + if ((xP>>log2_parallel_merge_level) == (xB1>>log2_parallel_merge_level) && + (yP>>log2_parallel_merge_level) == (yB1>>log2_parallel_merge_level)) { + availableB1 = false; + logtrace(LogMotion,"spatial merging candidate B1: below parallel merge level\n"); + } + // redundant candidate (Note 1) -> discard + else if (// !singleMCLFlag && automatically true when partIdx==1 + partIdx==1 && + (PartMode==PART_2NxN || + PartMode==PART_2NxnU || + PartMode==PART_2NxnD)) { + availableB1 = false; + logtrace(LogMotion,"spatial merging candidate B1: second part ignore\n"); + } + // MV available in B1 + else { + availableB1 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB1,yB1); + if (!availableB1) logtrace(LogMotion,"spatial merging candidate B1: unavailable\n"); + } + + if (availableB1) { + const PBMotion& b1 = img->get_mv_info(xB1,yB1); + + // B1 == A1 -> discard B1 + if (availableA1 && out_cand[idxA1] == b1) { + idxB1 = idxA1; + logtrace(LogMotion,"spatial merging candidate B1: redundant to A1\n"); + } + else { + idxB1 = computed_candidates++; + out_cand[idxB1] = b1; + + 
logtrace(LogMotion,"spatial merging candidate B1:\n"); + logmvcand(out_cand[idxB1]); + } + } + + if (computed_candidates>=maxCandidates) return computed_candidates; + + + // --- B0 --- + + int xB0 = xP+nPbW; + int yB0 = yP-1; + + bool availableB0; + int idxB0; + + if ((xP>>log2_parallel_merge_level) == (xB0>>log2_parallel_merge_level) && + (yP>>log2_parallel_merge_level) == (yB0>>log2_parallel_merge_level)) { + availableB0 = false; + logtrace(LogMotion,"spatial merging candidate B0: below parallel merge level\n"); + } + else { + availableB0 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB0,yB0); + if (!availableB0) logtrace(LogMotion,"spatial merging candidate B0: unavailable\n"); + } + + if (availableB0) { + const PBMotion& b0 = img->get_mv_info(xB0,yB0); + + // B0 == B1 -> discard B0 + if (availableB1 && out_cand[idxB1]==b0) { + idxB0 = idxB1; + logtrace(LogMotion,"spatial merging candidate B0: redundant to B1\n"); + } + else { + idxB0 = computed_candidates++; + out_cand[idxB0] = b0; + logtrace(LogMotion,"spatial merging candidate B0:\n"); + logmvcand(out_cand[idxB0]); + } + } + + if (computed_candidates>=maxCandidates) return computed_candidates; + + + // --- A0 --- + + int xA0 = xP-1; + int yA0 = yP+nPbH; + + bool availableA0; + int idxA0; + + if ((xP>>log2_parallel_merge_level) == (xA0>>log2_parallel_merge_level) && + (yP>>log2_parallel_merge_level) == (yA0>>log2_parallel_merge_level)) { + availableA0 = false; + logtrace(LogMotion,"spatial merging candidate A0: below parallel merge level\n"); + } + else { + availableA0 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA0,yA0); + if (!availableA0) logtrace(LogMotion,"spatial merging candidate A0: unavailable\n"); + } + + if (availableA0) { + const PBMotion& a0 = img->get_mv_info(xA0,yA0); + + // A0 == A1 -> discard A0 + if (availableA1 && out_cand[idxA1]==a0) { + idxA0 = idxA1; + logtrace(LogMotion,"spatial merging candidate A0: redundant to A1\n"); + } + else { + idxA0 = 
computed_candidates++; + out_cand[idxA0] = a0; + logtrace(LogMotion,"spatial merging candidate A0:\n"); + logmvcand(out_cand[idxA0]); + } + } + + if (computed_candidates>=maxCandidates) return computed_candidates; + + + // --- B2 --- + + int xB2 = xP-1; + int yB2 = yP-1; + + bool availableB2; + int idxB2; + + // if we already have four candidates, do not consider B2 anymore + if (computed_candidates==4) { + availableB2 = false; + logtrace(LogMotion,"spatial merging candidate B2: ignore\n"); + } + else if ((xP>>log2_parallel_merge_level) == (xB2>>log2_parallel_merge_level) && + (yP>>log2_parallel_merge_level) == (yB2>>log2_parallel_merge_level)) { + availableB2 = false; + logtrace(LogMotion,"spatial merging candidate B2: below parallel merge level\n"); + } + else { + availableB2 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB2,yB2); + if (!availableB2) logtrace(LogMotion,"spatial merging candidate B2: unavailable\n"); + } + + if (availableB2) { + const PBMotion& b2 = img->get_mv_info(xB2,yB2); + + // B2 == B1 -> discard B2 + if (availableB1 && out_cand[idxB1]==b2) { + idxB2 = idxB1; + logtrace(LogMotion,"spatial merging candidate B2: redundant to B1\n"); + } + // B2 == A1 -> discard B2 + else if (availableA1 && out_cand[idxA1]==b2) { + idxB2 = idxA1; + logtrace(LogMotion,"spatial merging candidate B2: redundant to A1\n"); + } + else { + idxB2 = computed_candidates++; + out_cand[idxB2] = b2; + logtrace(LogMotion,"spatial merging candidate B2:\n"); + logmvcand(out_cand[idxB2]); + } + } + + return computed_candidates; +} + + +// 8.5.3.1.4 +void derive_zero_motion_vector_candidates(const slice_segment_header* shdr, + PBMotion* out_mergeCandList, + int* inout_numCurrMergeCand, + int maxCandidates) +{ + logtrace(LogMotion,"derive_zero_motion_vector_candidates\n"); + + int numRefIdx; + + if (shdr->slice_type==SLICE_TYPE_P) { + numRefIdx = shdr->num_ref_idx_l0_active; + } + else { + numRefIdx = libde265_min(shdr->num_ref_idx_l0_active, + 
shdr->num_ref_idx_l1_active); + } + + + //int numInputMergeCand = *inout_numMergeCand; + int zeroIdx = 0; + + while (*inout_numCurrMergeCand < maxCandidates) { + // 1. + + logtrace(LogMotion,"zeroIdx:%d numRefIdx:%d\n", zeroIdx, numRefIdx); + + PBMotion* newCand = &out_mergeCandList[*inout_numCurrMergeCand]; + + const int refIdx = (zeroIdx < numRefIdx) ? zeroIdx : 0; + + if (shdr->slice_type==SLICE_TYPE_P) { + newCand->refIdx[0] = refIdx; + newCand->refIdx[1] = -1; + newCand->predFlag[0] = 1; + newCand->predFlag[1] = 0; + } + else { + newCand->refIdx[0] = refIdx; + newCand->refIdx[1] = refIdx; + newCand->predFlag[0] = 1; + newCand->predFlag[1] = 1; + } + + newCand->mv[0].x = 0; + newCand->mv[0].y = 0; + newCand->mv[1].x = 0; + newCand->mv[1].y = 0; + + (*inout_numCurrMergeCand)++; + + // 2. + + zeroIdx++; + } +} + + +bool scale_mv(MotionVector* out_mv, MotionVector mv, int colDist, int currDist) +{ + int td = Clip3(-128,127, colDist); + int tb = Clip3(-128,127, currDist); + + if (td==0) { + *out_mv = mv; + return false; + } + else { + int tx = (16384 + (abs_value(td)>>1)) / td; + int distScaleFactor = Clip3(-4096,4095, (tb*tx+32)>>6); + out_mv->x = Clip3(-32768,32767, + Sign(distScaleFactor*mv.x)*((abs_value(distScaleFactor*mv.x)+127)>>8)); + out_mv->y = Clip3(-32768,32767, + Sign(distScaleFactor*mv.y)*((abs_value(distScaleFactor*mv.y)+127)>>8)); + return true; + } +} + + +// (L1003) 8.5.3.2.8 + +void derive_collocated_motion_vectors(base_context* ctx, + de265_image* img, + const slice_segment_header* shdr, + int xP,int yP, + int colPic, + int xColPb,int yColPb, + int refIdxLX, // (always 0 for merge mode) + int X, + MotionVector* out_mvLXCol, + uint8_t* out_availableFlagLXCol) +{ + logtrace(LogMotion,"derive_collocated_motion_vectors %d;%d\n",xP,yP); + + + // get collocated image and the prediction mode at the collocated position + + assert(ctx->has_image(colPic)); + const de265_image* colImg = ctx->get_image(colPic); + + // check for access outside image area + + 
if (xColPb >= colImg->get_width() || + yColPb >= colImg->get_height()) { + ctx->add_warning(DE265_WARNING_COLLOCATED_MOTION_VECTOR_OUTSIDE_IMAGE_AREA, false); + *out_availableFlagLXCol = 0; + return; + } + + enum PredMode predMode = colImg->get_pred_mode(xColPb,yColPb); + + + // collocated block is Intra -> no collocated MV + + if (predMode == MODE_INTRA) { + out_mvLXCol->x = 0; + out_mvLXCol->y = 0; + *out_availableFlagLXCol = 0; + return; + } + + + logtrace(LogMotion,"colPic:%d (POC=%d) X:%d refIdxLX:%d refpiclist:%d\n", + colPic, + colImg->PicOrderCntVal, + X,refIdxLX,shdr->RefPicList[X][refIdxLX]); + + + // collocated reference image is unavailable -> no collocated MV + + if (colImg->integrity == INTEGRITY_UNAVAILABLE_REFERENCE) { + out_mvLXCol->x = 0; + out_mvLXCol->y = 0; + *out_availableFlagLXCol = 0; + return; + } + + + // get the collocated MV + + const PBMotion& mvi = colImg->get_mv_info(xColPb,yColPb); + int listCol; + int refIdxCol; + MotionVector mvCol; + + logtrace(LogMotion,"read MVI %d;%d:\n",xColPb,yColPb); + logmvcand(mvi); + + + // collocated MV uses only L1 -> use L1 + if (mvi.predFlag[0]==0) { + mvCol = mvi.mv[1]; + refIdxCol = mvi.refIdx[1]; + listCol = 1; + } + // collocated MV uses only L0 -> use L0 + else if (mvi.predFlag[1]==0) { + mvCol = mvi.mv[0]; + refIdxCol = mvi.refIdx[0]; + listCol = 0; + } + // collocated MV uses L0 and L1 + else { + bool allRefFramesBeforeCurrentFrame = true; + + const int currentPOC = img->PicOrderCntVal; + + // all reference POCs earlier than current POC (list 1) + // Test L1 first, because there is a higher change to find a future reference frame. 
+ + for (int rIdx=0; rIdxnum_ref_idx_l1_active && allRefFramesBeforeCurrentFrame; rIdx++) + { + const de265_image* refimg = ctx->get_image(shdr->RefPicList[1][rIdx]); + int refPOC = refimg->PicOrderCntVal; + + if (refPOC > currentPOC) { + allRefFramesBeforeCurrentFrame = false; + } + } + + // all reference POCs earlier than current POC (list 0) + + for (int rIdx=0; rIdxnum_ref_idx_l0_active && allRefFramesBeforeCurrentFrame; rIdx++) + { + const de265_image* refimg = ctx->get_image(shdr->RefPicList[0][rIdx]); + int refPOC = refimg->PicOrderCntVal; + + if (refPOC > currentPOC) { + allRefFramesBeforeCurrentFrame = false; + } + } + + + /* TODO: What is the rationale behind this ??? + + My guess: + when there are images before the current frame (most probably in L0) and images after + the current frame (most probably in L1), we take the reference in the opposite + direction than where the collocated frame is positioned in the hope that the distance + to the current frame will be smaller and thus give a better prediction. + + If all references point into the past, we cannot say much about the temporal order or + L0,L1 and thus take over both parts. 
+ */ + + if (allRefFramesBeforeCurrentFrame) { + mvCol = mvi.mv[X]; + refIdxCol = mvi.refIdx[X]; + listCol = X; + } + else { + int N = shdr->collocated_from_l0_flag; + mvCol = mvi.mv[N]; + refIdxCol = mvi.refIdx[N]; + listCol = N; + } + } + + + + const slice_segment_header* colShdr = colImg->slices[ colImg->get_SliceHeaderIndex(xColPb,yColPb) ]; + + if (shdr->LongTermRefPic[X][refIdxLX] != + colShdr->LongTermRefPic[listCol][refIdxCol]) { + *out_availableFlagLXCol = 0; + out_mvLXCol->x = 0; + out_mvLXCol->y = 0; + } + else { + *out_availableFlagLXCol = 1; + + const bool isLongTerm = shdr->LongTermRefPic[X][refIdxLX]; + + int colDist = colImg->PicOrderCntVal - colShdr->RefPicList_POC[listCol][refIdxCol]; + int currDist = img->PicOrderCntVal - shdr->RefPicList_POC[X][refIdxLX]; + + logtrace(LogMotion,"COLPOCDIFF %d %d [%d %d / %d %d]\n",colDist, currDist, + colImg->PicOrderCntVal, colShdr->RefPicList_POC[listCol][refIdxCol], + img->PicOrderCntVal, shdr->RefPicList_POC[X][refIdxLX] + ); + + if (isLongTerm || colDist == currDist) { + *out_mvLXCol = mvCol; + } + else { + if (!scale_mv(out_mvLXCol, mvCol, colDist, currDist)) { + ctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false); + img->integrity = INTEGRITY_DECODING_ERRORS; + } + + logtrace(LogMotion,"scale: %d;%d to %d;%d\n", + mvCol.x,mvCol.y, out_mvLXCol->x,out_mvLXCol->y); + } + } +} + + +// 8.5.3.1.7 +void derive_temporal_luma_vector_prediction(base_context* ctx, + de265_image* img, + const slice_segment_header* shdr, + int xP,int yP, + int nPbW,int nPbH, + int refIdxL, + int X, // which MV (L0/L1) to get + MotionVector* out_mvLXCol, + uint8_t* out_availableFlagLXCol) +{ + // --- no temporal MVP -> exit --- + + if (shdr->slice_temporal_mvp_enabled_flag == 0) { + out_mvLXCol->x = 0; + out_mvLXCol->y = 0; + *out_availableFlagLXCol = 0; + return; + } + + + // --- find collocated reference image --- + + int Log2CtbSizeY = img->get_sps().Log2CtbSizeY; + + int colPic; // TODO: this is the same for the 
whole slice. We can precompute it. + + if (shdr->slice_type == SLICE_TYPE_B && + shdr->collocated_from_l0_flag == 0) + { + logtrace(LogMotion,"collocated L1 ref_idx=%d\n",shdr->collocated_ref_idx); + + colPic = shdr->RefPicList[1][ shdr->collocated_ref_idx ]; + } + else + { + logtrace(LogMotion,"collocated L0 ref_idx=%d\n",shdr->collocated_ref_idx); + + colPic = shdr->RefPicList[0][ shdr->collocated_ref_idx ]; + } + + + // check whether collocated reference picture exists + + if (!ctx->has_image(colPic)) { + out_mvLXCol->x = 0; + out_mvLXCol->y = 0; + *out_availableFlagLXCol = 0; + + ctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false); + return; + } + + + // --- get collocated MV either at bottom-right corner or from center of PB --- + + int xColPb,yColPb; + int yColBr = yP + nPbH; // bottom right collocated motion vector position + int xColBr = xP + nPbW; + + /* If neighboring pixel at bottom-right corner is in the same CTB-row and inside the image, + use this (reduced down to 16 pixels resolution) as collocated MV position. + + Note: see 2014, Sze, Sect. 5.2.1.2 why candidate C0 is excluded when on another CTB-row. + This is to reduce the memory bandwidth requirements. 
+ */ + if ((yP>>Log2CtbSizeY) == (yColBr>>Log2CtbSizeY) && + xColBr < img->get_sps().pic_width_in_luma_samples && + yColBr < img->get_sps().pic_height_in_luma_samples) + { + xColPb = xColBr & ~0x0F; // reduce resolution of collocated motion-vectors to 16 pixels grid + yColPb = yColBr & ~0x0F; + + derive_collocated_motion_vectors(ctx,img,shdr, xP,yP, colPic, xColPb,yColPb, refIdxL, X, + out_mvLXCol, out_availableFlagLXCol); + } + else + { + out_mvLXCol->x = 0; + out_mvLXCol->y = 0; + *out_availableFlagLXCol = 0; + } + + + if (*out_availableFlagLXCol==0) { + + int xColCtr = xP+(nPbW>>1); + int yColCtr = yP+(nPbH>>1); + + xColPb = xColCtr & ~0x0F; // reduce resolution of collocated motion-vectors to 16 pixels grid + yColPb = yColCtr & ~0x0F; + + derive_collocated_motion_vectors(ctx,img,shdr, xP,yP, colPic, xColPb,yColPb, refIdxL, X, + out_mvLXCol, out_availableFlagLXCol); + } +} + + +static int table_8_19[2][12] = { + { 0,1,0,2,1,2,0,3,1,3,2,3 }, + { 1,0,2,0,2,1,3,0,3,1,3,2 } + }; + +// 8.5.3.1.3 +/* Note (TODO): during decoding, we know which of the candidates we will select. ++ Hence, we do not really have to generate the other ones... 
++ */ +void derive_combined_bipredictive_merging_candidates(const base_context* ctx, + const slice_segment_header* shdr, + PBMotion* inout_mergeCandList, + int* inout_numMergeCand, + int maxCandidates) +{ + if (*inout_numMergeCand>1 && *inout_numMergeCand < maxCandidates) { + int numOrigMergeCand = *inout_numMergeCand; + + int numInputMergeCand = *inout_numMergeCand; + int combIdx = 0; + uint8_t combStop = false; + + while (!combStop) { + int l0CandIdx = table_8_19[0][combIdx]; + int l1CandIdx = table_8_19[1][combIdx]; + + if (l0CandIdx >= numInputMergeCand || + l1CandIdx >= numInputMergeCand) { + assert(false); // bitstream error -> TODO: conceal error + } + + PBMotion& l0Cand = inout_mergeCandList[l0CandIdx]; + PBMotion& l1Cand = inout_mergeCandList[l1CandIdx]; + + logtrace(LogMotion,"add bipredictive merging candidate (combIdx:%d)\n",combIdx); + logtrace(LogMotion,"l0Cand:\n"); logmvcand(l0Cand); + logtrace(LogMotion,"l1Cand:\n"); logmvcand(l1Cand); + + const de265_image* img0 = l0Cand.predFlag[0] ? ctx->get_image(shdr->RefPicList[0][l0Cand.refIdx[0]]) : NULL; + const de265_image* img1 = l1Cand.predFlag[1] ? 
ctx->get_image(shdr->RefPicList[1][l1Cand.refIdx[1]]) : NULL; + + if (l0Cand.predFlag[0] && !img0) { + return; // TODO error + } + + if (l1Cand.predFlag[1] && !img1) { + return; // TODO error + } + + if (l0Cand.predFlag[0] && l1Cand.predFlag[1] && + (img0->PicOrderCntVal != img1->PicOrderCntVal || + l0Cand.mv[0].x != l1Cand.mv[1].x || + l0Cand.mv[0].y != l1Cand.mv[1].y)) { + PBMotion& p = inout_mergeCandList[ *inout_numMergeCand ]; + p.refIdx[0] = l0Cand.refIdx[0]; + p.refIdx[1] = l1Cand.refIdx[1]; + p.predFlag[0] = l0Cand.predFlag[0]; + p.predFlag[1] = l1Cand.predFlag[1]; + p.mv[0] = l0Cand.mv[0]; + p.mv[1] = l1Cand.mv[1]; + (*inout_numMergeCand)++; + + logtrace(LogMotion,"result:\n"); + logmvcand(p); + } + + combIdx++; + if (combIdx == numOrigMergeCand*(numOrigMergeCand-1) || + *inout_numMergeCand == maxCandidates) { + combStop = true; + } + } + } +} + + +// 8.5.3.1.1 + +void get_merge_candidate_list_without_step_9(base_context* ctx, + const slice_segment_header* shdr, + const MotionVectorAccess& mvaccess, + de265_image* img, + int xC,int yC, int xP,int yP, + int nCS, int nPbW,int nPbH, int partIdx, + int max_merge_idx, + PBMotion* mergeCandList) +{ + + //int xOrigP = xP; + //int yOrigP = yP; + int nOrigPbW = nPbW; + int nOrigPbH = nPbH; + + int singleMCLFlag; // single merge-candidate-list (MCL) flag + + /* Use single MCL for CBs of size 8x8, except when parallel-merge-level is at 4x4. + Without this flag, PBs smaller than 8x8 would not receive as much merging candidates. + Having additional candidates might have these advantages: + - coding MVs for these small PBs is expensive, and + - since the PBs are not far away from a proper (neighboring) merging candidate, + the quality of the candidates will still be good. 
+ */ + singleMCLFlag = (img->get_pps().log2_parallel_merge_level > 2 && nCS==8); + + if (singleMCLFlag) { + xP=xC; + yP=yC; + nPbW=nCS; + nPbH=nCS; + partIdx=0; + } + + int maxCandidates = max_merge_idx+1; + //MotionVectorSpec mergeCandList[5]; + int numMergeCand=0; + + // --- spatial merge candidates + + numMergeCand = derive_spatial_merging_candidates(mvaccess, + img, xC,yC, nCS, xP,yP, singleMCLFlag, + nPbW,nPbH,partIdx, mergeCandList, + maxCandidates); + + // --- collocated merge candidate + if (numMergeCand < maxCandidates) { + int refIdxCol[2] = { 0,0 }; + + MotionVector mvCol[2]; + uint8_t predFlagLCol[2]; + derive_temporal_luma_vector_prediction(ctx,img,shdr, xP,yP,nPbW,nPbH, + refIdxCol[0],0, &mvCol[0], + &predFlagLCol[0]); + + uint8_t availableFlagCol = predFlagLCol[0]; + predFlagLCol[1] = 0; + + if (shdr->slice_type == SLICE_TYPE_B) { + derive_temporal_luma_vector_prediction(ctx,img,shdr, + xP,yP,nPbW,nPbH, refIdxCol[1],1, &mvCol[1], + &predFlagLCol[1]); + availableFlagCol |= predFlagLCol[1]; + } + + + if (availableFlagCol) { + PBMotion* colVec = &mergeCandList[numMergeCand++]; + + colVec->mv[0] = mvCol[0]; + colVec->mv[1] = mvCol[1]; + colVec->predFlag[0] = predFlagLCol[0]; + colVec->predFlag[1] = predFlagLCol[1]; + colVec->refIdx[0] = refIdxCol[0]; + colVec->refIdx[1] = refIdxCol[1]; + } + } + + + // --- bipredictive merge candidates --- + + if (shdr->slice_type == SLICE_TYPE_B) { + derive_combined_bipredictive_merging_candidates(ctx, shdr, + mergeCandList, &numMergeCand, maxCandidates); + } + + + // --- zero-vector merge candidates --- + + derive_zero_motion_vector_candidates(shdr, mergeCandList, &numMergeCand, maxCandidates); + + + logtrace(LogMotion,"mergeCandList:\n"); + for (int i=0;iMaxNumMergeCand;i++) + { + //logtrace(LogMotion, " %d:%s\n", i, i==merge_idx ? 
" SELECTED":""); + logmvcand(mergeCandList[i]); + } +} + + + +void get_merge_candidate_list(base_context* ctx, + const slice_segment_header* shdr, + de265_image* img, + int xC,int yC, int xP,int yP, + int nCS, int nPbW,int nPbH, int partIdx, + PBMotion* mergeCandList) +{ + int max_merge_idx = 5-shdr->five_minus_max_num_merge_cand -1; + + get_merge_candidate_list_without_step_9(ctx, shdr, + MotionVectorAccess_de265_image(img), img, + xC,yC,xP,yP,nCS,nPbW,nPbH, partIdx, + max_merge_idx, mergeCandList); + + // 9. for encoder: modify all merge candidates + + for (int i=0;i<=max_merge_idx;i++) { + if (mergeCandList[i].predFlag[0] && + mergeCandList[i].predFlag[1] && + nPbW+nPbH==12) + { + mergeCandList[i].refIdx[1] = -1; + mergeCandList[i].predFlag[1] = 0; + } + } +} + + +void derive_luma_motion_merge_mode(base_context* ctx, + const slice_segment_header* shdr, + de265_image* img, + int xC,int yC, int xP,int yP, + int nCS, int nPbW,int nPbH, int partIdx, + int merge_idx, + PBMotion* out_vi) +{ + PBMotion mergeCandList[5]; + + get_merge_candidate_list_without_step_9(ctx, shdr, + MotionVectorAccess_de265_image(img), img, + xC,yC,xP,yP,nCS,nPbW,nPbH, partIdx, + merge_idx, mergeCandList); + + + *out_vi = mergeCandList[merge_idx]; + + // 8.5.3.1.1 / 9. + + if (out_vi->predFlag[0] && out_vi->predFlag[1] && nPbW+nPbH==12) { + out_vi->refIdx[1] = -1; + out_vi->predFlag[1] = 0; + } +} + + +// 8.5.3.1.6 +void derive_spatial_luma_vector_prediction(base_context* ctx, + de265_image* img, + const slice_segment_header* shdr, + int xC,int yC,int nCS,int xP,int yP, + int nPbW,int nPbH, int X, + int refIdxLX, int partIdx, + uint8_t out_availableFlagLXN[2], + MotionVector out_mvLXN[2]) +{ + int isScaledFlagLX = 0; + + const int A=0; + const int B=1; + + out_availableFlagLXN[A] = 0; + out_availableFlagLXN[B] = 0; + + + // --- A --- + + // 1. + + int xA[2], yA[2]; + xA[0] = xP-1; + yA[0] = yP + nPbH; + xA[1] = xA[0]; + yA[1] = yA[0]-1; + + // 2. 
+ + out_availableFlagLXN[A] = 0; + out_mvLXN[A].x = 0; + out_mvLXN[A].y = 0; + + // 3. / 4. + + bool availableA[2]; + availableA[0] = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA[0],yA[0]); + availableA[1] = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA[1],yA[1]); + + // 5. + + if (availableA[0] || availableA[1]) { + isScaledFlagLX = 1; + } + + // 6. test A0 and A1 (Ak) + + int refIdxA=-1; + + // the POC we want to reference in this PB + const de265_image* tmpimg = ctx->get_image(shdr->RefPicList[X][ refIdxLX ]); + if (tmpimg==NULL) { return; } + const int referenced_POC = tmpimg->PicOrderCntVal; + + for (int k=0;k<=1;k++) { + if (availableA[k] && + out_availableFlagLXN[A]==0 && // no A?-predictor so far + img->get_pred_mode(xA[k],yA[k]) != MODE_INTRA) { + + int Y=1-X; + + const PBMotion& vi = img->get_mv_info(xA[k],yA[k]); + logtrace(LogMotion,"MVP A%d=\n",k); + logmvcand(vi); + + const de265_image* imgX = NULL; + if (vi.predFlag[X]) imgX = ctx->get_image(shdr->RefPicList[X][ vi.refIdx[X] ]); + const de265_image* imgY = NULL; + if (vi.predFlag[Y]) imgY = ctx->get_image(shdr->RefPicList[Y][ vi.refIdx[Y] ]); + + // check whether the predictor X is available and references the same POC + if (vi.predFlag[X] && imgX && imgX->PicOrderCntVal == referenced_POC) { + + logtrace(LogMotion,"take A%d/L%d as A candidate with same POC\n",k,X); + + out_availableFlagLXN[A]=1; + out_mvLXN[A] = vi.mv[X]; + refIdxA = vi.refIdx[X]; + } + // check whether the other predictor (Y) is available and references the same POC + else if (vi.predFlag[Y] && imgY && imgY->PicOrderCntVal == referenced_POC) { + + logtrace(LogMotion,"take A%d/L%d as A candidate with same POC\n",k,Y); + + out_availableFlagLXN[A]=1; + out_mvLXN[A] = vi.mv[Y]; + refIdxA = vi.refIdx[Y]; + } + } + } + + // 7. 
If there is no predictor referencing the same POC, we take any other reference as + // long as it is the same type of reference (long-term / short-term) + + for (int k=0 ; k<=1 && out_availableFlagLXN[A]==0 ; k++) { + int refPicList=-1; + + if (availableA[k] && + // TODO: we could remove this call by storing the result of the similar computation above + img->get_pred_mode(xA[k],yA[k]) != MODE_INTRA) { + + int Y=1-X; + + const PBMotion& vi = img->get_mv_info(xA[k],yA[k]); + if (vi.predFlag[X]==1 && + shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[X][ vi.refIdx[X] ]) { + + logtrace(LogMotion,"take A%D/L%d as A candidate with different POCs\n",k,X); + + out_availableFlagLXN[A]=1; + out_mvLXN[A] = vi.mv[X]; + refIdxA = vi.refIdx[X]; + refPicList = X; + } + else if (vi.predFlag[Y]==1 && + shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[Y][ vi.refIdx[Y] ]) { + + logtrace(LogMotion,"take A%d/L%d as A candidate with different POCs\n",k,Y); + + out_availableFlagLXN[A]=1; + out_mvLXN[A] = vi.mv[Y]; + refIdxA = vi.refIdx[Y]; + refPicList = Y; + } + } + + if (out_availableFlagLXN[A]==1) { + if (refIdxA<0) { + out_availableFlagLXN[0] = out_availableFlagLXN[1] = false; + return; // error + } + + assert(refIdxA>=0); + assert(refPicList>=0); + + const de265_image* refPicA = ctx->get_image(shdr->RefPicList[refPicList][refIdxA ]); + const de265_image* refPicX = ctx->get_image(shdr->RefPicList[X ][refIdxLX]); + + //int picStateA = shdr->RefPicList_PicState[refPicList][refIdxA ]; + //int picStateX = shdr->RefPicList_PicState[X ][refIdxLX]; + + int isLongTermA = shdr->LongTermRefPic[refPicList][refIdxA ]; + int isLongTermX = shdr->LongTermRefPic[X ][refIdxLX]; + + logtrace(LogMotion,"scale MVP A: A-POC:%d X-POC:%d\n", + refPicA->PicOrderCntVal,refPicX->PicOrderCntVal); + + if (!isLongTermA && !isLongTermX) + /* + if (picStateA == UsedForShortTermReference && + picStateX == UsedForShortTermReference) + */ + { + int distA = img->PicOrderCntVal - 
refPicA->PicOrderCntVal; + int distX = img->PicOrderCntVal - referenced_POC; + + if (!scale_mv(&out_mvLXN[A], out_mvLXN[A], distA, distX)) { + ctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false); + img->integrity = INTEGRITY_DECODING_ERRORS; + } + } + } + } + + + // --- B --- + + // 1. + + int xB[3], yB[3]; + xB[0] = xP+nPbW; + yB[0] = yP-1; + xB[1] = xB[0]-1; + yB[1] = yP-1; + xB[2] = xP-1; + yB[2] = yP-1; + + // 2. + + out_availableFlagLXN[B] = 0; + out_mvLXN[B].x = 0; + out_mvLXN[B].y = 0; + + // 3. test B0,B1,B2 (Bk) + + int refIdxB=-1; + + bool availableB[3]; + for (int k=0;k<3;k++) { + availableB[k] = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB[k],yB[k]); + + if (availableB[k] && out_availableFlagLXN[B]==0) { + + int Y=1-X; + + const PBMotion& vi = img->get_mv_info(xB[k],yB[k]); + logtrace(LogMotion,"MVP B%d=\n",k); + logmvcand(vi); + + + const de265_image* imgX = NULL; + if (vi.predFlag[X]) imgX = ctx->get_image(shdr->RefPicList[X][ vi.refIdx[X] ]); + const de265_image* imgY = NULL; + if (vi.predFlag[Y]) imgY = ctx->get_image(shdr->RefPicList[Y][ vi.refIdx[Y] ]); + + if (vi.predFlag[X] && imgX && imgX->PicOrderCntVal == referenced_POC) { + logtrace(LogMotion,"a) take B%d/L%d as B candidate with same POC\n",k,X); + + out_availableFlagLXN[B]=1; + out_mvLXN[B] = vi.mv[X]; + refIdxB = vi.refIdx[X]; + } + else if (vi.predFlag[Y] && imgY && imgY->PicOrderCntVal == referenced_POC) { + logtrace(LogMotion,"b) take B%d/L%d as B candidate with same POC\n",k,Y); + + out_availableFlagLXN[B]=1; + out_mvLXN[B] = vi.mv[Y]; + refIdxB = vi.refIdx[Y]; + } + } + } + + // 4. + + if (isScaledFlagLX==0 && // no A predictor, + out_availableFlagLXN[B]) // but an unscaled B predictor + { + // use unscaled B predictor as A predictor + + logtrace(LogMotion,"copy the same-POC B candidate as additional A candidate\n"); + + out_availableFlagLXN[A]=1; + out_mvLXN[A] = out_mvLXN[B]; + refIdxA = refIdxB; + } + + // 5. 
+ + // If no A predictor, we output the unscaled B as the A predictor (above) + // and also add a scaled B predictor here. + // If there is (probably) an A predictor, no differing-POC B predictor is generated. + if (isScaledFlagLX==0) { + out_availableFlagLXN[B]=0; + + for (int k=0 ; k<=2 && out_availableFlagLXN[B]==0 ; k++) { + int refPicList=-1; + + if (availableB[k]) { + int Y=1-X; + + const PBMotion& vi = img->get_mv_info(xB[k],yB[k]); + + if (vi.predFlag[X]==1 && + shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[X][ vi.refIdx[X] ]) { + out_availableFlagLXN[B]=1; + out_mvLXN[B] = vi.mv[X]; + refIdxB = vi.refIdx[X]; + refPicList = X; + } + else if (vi.predFlag[Y]==1 && + shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[Y][ vi.refIdx[Y] ]) { + out_availableFlagLXN[B]=1; + out_mvLXN[B] = vi.mv[Y]; + refIdxB = vi.refIdx[Y]; + refPicList = Y; + } + } + + if (out_availableFlagLXN[B]==1) { + if (refIdxB<0) { + out_availableFlagLXN[0] = out_availableFlagLXN[1] = false; + return; // error + } + + assert(refPicList>=0); + assert(refIdxB>=0); + + const de265_image* refPicB=ctx->get_image(shdr->RefPicList[refPicList][refIdxB ]); + const de265_image* refPicX=ctx->get_image(shdr->RefPicList[X ][refIdxLX]); + + int isLongTermB = shdr->LongTermRefPic[refPicList][refIdxB ]; + int isLongTermX = shdr->LongTermRefPic[X ][refIdxLX]; + + if (refPicB==NULL || refPicX==NULL) { + img->decctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED,false); + img->integrity = INTEGRITY_DECODING_ERRORS; + } + else if (refPicB->PicOrderCntVal != refPicX->PicOrderCntVal && + !isLongTermB && !isLongTermX) { + int distB = img->PicOrderCntVal - refPicB->PicOrderCntVal; + int distX = img->PicOrderCntVal - referenced_POC; + + logtrace(LogMotion,"scale MVP B: B-POC:%d X-POC:%d\n",refPicB->PicOrderCntVal,refPicX->PicOrderCntVal); + + if (!scale_mv(&out_mvLXN[B], out_mvLXN[B], distB, distX)) { + ctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false); + 
img->integrity = INTEGRITY_DECODING_ERRORS; + } + } + } + } + } +} + + +// 8.5.3.1.5 +void fill_luma_motion_vector_predictors(base_context* ctx, + const slice_segment_header* shdr, + de265_image* img, + int xC,int yC,int nCS,int xP,int yP, + int nPbW,int nPbH, int l, + int refIdx, int partIdx, + MotionVector out_mvpList[2]) +{ + // 8.5.3.1.6: derive two spatial vector predictors A (0) and B (1) + + uint8_t availableFlagLXN[2]; + MotionVector mvLXN[2]; + + derive_spatial_luma_vector_prediction(ctx, img, shdr, xC,yC, nCS, xP,yP, + nPbW,nPbH, l, refIdx, partIdx, + availableFlagLXN, mvLXN); + + // 8.5.3.1.7: if we only have one spatial vector or both spatial vectors are the same, + // derive a temporal predictor + + uint8_t availableFlagLXCol; + MotionVector mvLXCol; + + + if (availableFlagLXN[0] && + availableFlagLXN[1] && + (mvLXN[0].x != mvLXN[1].x || mvLXN[0].y != mvLXN[1].y)) { + availableFlagLXCol = 0; + } + else { + derive_temporal_luma_vector_prediction(ctx, img, shdr, + xP,yP, nPbW,nPbH, refIdx,l, + &mvLXCol, &availableFlagLXCol); + } + + + // --- build candidate vector list with exactly two entries --- + + int numMVPCandLX=0; + + // spatial predictor A + + if (availableFlagLXN[0]) + { + out_mvpList[numMVPCandLX++] = mvLXN[0]; + } + + // spatial predictor B (if not same as A) + + if (availableFlagLXN[1] && + (!availableFlagLXN[0] || // in case A in not available, but mvLXA initialized to same as mvLXB + (mvLXN[0].x != mvLXN[1].x || mvLXN[0].y != mvLXN[1].y))) + { + out_mvpList[numMVPCandLX++] = mvLXN[1]; + } + + // temporal predictor + + if (availableFlagLXCol) + { + out_mvpList[numMVPCandLX++] = mvLXCol; + } + + // fill with zero predictors + + while (numMVPCandLX<2) { + out_mvpList[numMVPCandLX].x = 0; + out_mvpList[numMVPCandLX].y = 0; + numMVPCandLX++; + } + + + assert(numMVPCandLX==2); +} + + +MotionVector luma_motion_vector_prediction(base_context* ctx, + const slice_segment_header* shdr, + de265_image* img, + const PBMotionCoding& motion, + int xC,int 
yC,int nCS,int xP,int yP, + int nPbW,int nPbH, int l, + int refIdx, int partIdx) +{ + MotionVector mvpList[2]; + + fill_luma_motion_vector_predictors(ctx, shdr, img, + xC,yC,nCS,xP,yP, + nPbW, nPbH, l, refIdx, partIdx, + mvpList); + + // select predictor according to mvp_lX_flag + + return mvpList[ l ? motion.mvp_l1_flag : motion.mvp_l0_flag ]; +} + + +#if DE265_LOG_TRACE +void logMV(int x0,int y0,int nPbW,int nPbH, const char* mode,const PBMotion* mv) +{ + int pred0 = mv->predFlag[0]; + int pred1 = mv->predFlag[1]; + + logtrace(LogMotion, + "*MV %d;%d [%d;%d] %s: (%d) %d;%d @%d (%d) %d;%d @%d\n", x0,y0,nPbW,nPbH,mode, + pred0, + pred0 ? mv->mv[0].x : 0,pred0 ? mv->mv[0].y : 0, pred0 ? mv->refIdx[0] : 0, + pred1, + pred1 ? mv->mv[1].x : 0,pred1 ? mv->mv[1].y : 0, pred1 ? mv->refIdx[1] : 0); +} +#else +#define logMV(x0,y0,nPbW,nPbH,mode,mv) +#endif + + + +// 8.5.3.1 +void motion_vectors_and_ref_indices(base_context* ctx, + const slice_segment_header* shdr, + de265_image* img, + const PBMotionCoding& motion, + int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, + int partIdx, + PBMotion* out_vi) +{ + //slice_segment_header* shdr = tctx->shdr; + + int xP = xC+xB; + int yP = yC+yB; + + enum PredMode predMode = img->get_pred_mode(xC,yC); + + if (predMode == MODE_SKIP || + (predMode == MODE_INTER && motion.merge_flag)) + { + derive_luma_motion_merge_mode(ctx,shdr,img, + xC,yC, xP,yP, nCS,nPbW,nPbH, partIdx, + motion.merge_idx, out_vi); + + logMV(xP,yP,nPbW,nPbH, "merge_mode", out_vi); + } + else { + int mvdL[2][2]; + MotionVector mvpL[2]; + + for (int l=0;l<2;l++) { + // 1. + + enum InterPredIdc inter_pred_idc = (enum InterPredIdc)motion.inter_pred_idc; + + if (inter_pred_idc == PRED_BI || + (inter_pred_idc == PRED_L0 && l==0) || + (inter_pred_idc == PRED_L1 && l==1)) { + out_vi->refIdx[l] = motion.refIdx[l]; + out_vi->predFlag[l] = 1; + } + else { + out_vi->refIdx[l] = -1; + out_vi->predFlag[l] = 0; + } + + // 2. 
+ + mvdL[l][0] = motion.mvd[l][0]; + mvdL[l][1] = motion.mvd[l][1]; + + + if (out_vi->predFlag[l]) { + // 3. + + mvpL[l] = luma_motion_vector_prediction(ctx,shdr,img,motion, + xC,yC,nCS,xP,yP, nPbW,nPbH, l, + out_vi->refIdx[l], partIdx); + + // 4. + + int32_t x = (mvpL[l].x + mvdL[l][0] + 0x10000) & 0xFFFF; + int32_t y = (mvpL[l].y + mvdL[l][1] + 0x10000) & 0xFFFF; + + out_vi->mv[l].x = (x>=0x8000) ? x-0x10000 : x; + out_vi->mv[l].y = (y>=0x8000) ? y-0x10000 : y; + } + } + + logMV(xP,yP,nPbW,nPbH, "mvp", out_vi); + } +} + + +// 8.5.3 + +/* xC/yC : CB position + xB/yB : position offset of the PB + nPbW/nPbH : size of PB + nCS : CB size + */ +void decode_prediction_unit(base_context* ctx, + const slice_segment_header* shdr, + de265_image* img, + const PBMotionCoding& motion, + int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, int partIdx) +{ + logtrace(LogMotion,"decode_prediction_unit POC=%d %d;%d %dx%d\n", + img->PicOrderCntVal, xC+xB,yC+yB, nPbW,nPbH); + + //slice_segment_header* shdr = tctx->shdr; + + // 1. + + PBMotion vi; + motion_vectors_and_ref_indices(ctx, shdr, img, motion, + xC,yC, xB,yB, nCS, nPbW,nPbH, partIdx, &vi); + + // 2. + + generate_inter_prediction_samples(ctx,shdr, img, xC,yC, xB,yB, nCS, nPbW,nPbH, &vi); + + + img->set_mv_info(xC+xB,yC+yB,nPbW,nPbH, vi); +} diff --git a/nal-parser.cc b/nal-parser.cc new file mode 100644 index 0000000..ea95ed1 --- /dev/null +++ b/nal-parser.cc @@ -0,0 +1,446 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "nal-parser.h" + +#include +#include +#include +#include + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + + +NAL_unit::NAL_unit() + : skipped_bytes(DE265_SKIPPED_BYTES_INITIAL_SIZE) +{ + pts=0; + user_data = NULL; + + nal_data = NULL; + data_size = 0; + capacity = 0; +} + +NAL_unit::~NAL_unit() +{ + free(nal_data); +} + +void NAL_unit::clear() +{ + header = nal_header(); + pts = 0; + user_data = NULL; + + // set size to zero but keep memory + data_size = 0; + + skipped_bytes.clear(); +} + +LIBDE265_CHECK_RESULT bool NAL_unit::resize(int new_size) +{ + if (capacity < new_size) { + unsigned char* newbuffer = (unsigned char*)malloc(new_size); + if (newbuffer == NULL) { + return false; + } + + if (nal_data != NULL) { + memcpy(newbuffer, nal_data, data_size); + free(nal_data); + } + + nal_data = newbuffer; + capacity = new_size; + } + return true; +} + +LIBDE265_CHECK_RESULT bool NAL_unit::append(const unsigned char* in_data, int n) +{ + if (!resize(data_size + n)) { + return false; + } + memcpy(nal_data + data_size, in_data, n); + data_size += n; + return true; +} + +bool LIBDE265_CHECK_RESULT NAL_unit::set_data(const unsigned char* in_data, int n) +{ + if (!resize(n)) { + return false; + } + memcpy(nal_data, in_data, n); + data_size = n; + return true; +} + +void NAL_unit::insert_skipped_byte(int pos) +{ + skipped_bytes.push_back(pos); +} + +int NAL_unit::num_skipped_bytes_before(int byte_position, int headerLength) const +{ + for (int k=skipped_bytes.size()-1;k>=0;k--) + if (skipped_bytes[k]-headerLength <= byte_position) { + return k+1; + } + + return 0; +} + +void 
NAL_unit::remove_stuffing_bytes() +{ + uint8_t* p = data(); + + for (int i=0;i 0) { + nal = NAL_free_list.back(); + NAL_free_list.pop_back(); + } + else { + nal = new NAL_unit; + } + + nal->clear(); + if (!nal->resize(size)) { + free_NAL_unit(nal); + return NULL; + } + + return nal; +} + +void NAL_Parser::free_NAL_unit(NAL_unit* nal) +{ + if (nal == NULL) { + // Allow calling with NULL just like regular "free()" + return; + } + if (NAL_free_list.size() < DE265_NAL_FREE_LIST_SIZE) { + NAL_free_list.push_back(nal); + } + else { + delete nal; + } +} + +NAL_unit* NAL_Parser::pop_from_NAL_queue() +{ + if (NAL_queue.empty()) { + return NULL; + } + else { + NAL_unit* nal = NAL_queue.front(); + NAL_queue.pop(); + + nBytes_in_NAL_queue -= nal->size(); + + return nal; + } +} + +void NAL_Parser::push_to_NAL_queue(NAL_unit* nal) +{ + NAL_queue.push(nal); + nBytes_in_NAL_queue += nal->size(); +} + +de265_error NAL_Parser::push_data(const unsigned char* data, int len, + de265_PTS pts, void* user_data) +{ + end_of_frame = false; + + if (pending_input_NAL == NULL) { + pending_input_NAL = alloc_NAL_unit(len+3); + if (pending_input_NAL == NULL) { + return DE265_ERROR_OUT_OF_MEMORY; + } + pending_input_NAL->pts = pts; + pending_input_NAL->user_data = user_data; + } + + NAL_unit* nal = pending_input_NAL; // shortcut + + // Resize output buffer so that complete input would fit. + // We add 3, because in the worst case 3 extra bytes are created for an input byte. 
+ if (!nal->resize(nal->size() + len + 3)) { + return DE265_ERROR_OUT_OF_MEMORY; + } + + unsigned char* out = nal->data() + nal->size(); + + for (int i=0;iinput_push_state, *data, data, + out - ctx->nal_data.data); + */ + + switch (input_push_state) { + case 0: + case 1: + if (*data == 0) { input_push_state++; } + else { input_push_state=0; } + break; + case 2: + if (*data == 1) { input_push_state=3; } // nal->clear_skipped_bytes(); } + else if (*data == 0) { } // *out++ = 0; } + else { input_push_state=0; } + break; + case 3: + *out++ = *data; + input_push_state = 4; + break; + case 4: + *out++ = *data; + input_push_state = 5; + break; + + case 5: + if (*data==0) { input_push_state=6; } + else { *out++ = *data; } + break; + + case 6: + if (*data==0) { input_push_state=7; } + else { + *out++ = 0; + *out++ = *data; + input_push_state=5; + } + break; + + case 7: + if (*data==0) { *out++ = 0; } + else if (*data==3) { + *out++ = 0; *out++ = 0; input_push_state=5; + + // remember which byte we removed + nal->insert_skipped_byte((out - nal->data()) + nal->num_skipped_bytes()); + } + else if (*data==1) { + +#if DEBUG_INSERT_STREAM_ERRORS + if ((rand()%100)<90 && nal_data.size>0) { + int pos = rand()%nal_data.size; + int bit = rand()%8; + nal->nal_data.data[pos] ^= 1<set_size(out - nal->data());; + + // push this NAL decoder queue + push_to_NAL_queue(nal); + + + // initialize new, empty NAL unit + + pending_input_NAL = alloc_NAL_unit(len+3); + if (pending_input_NAL == NULL) { + return DE265_ERROR_OUT_OF_MEMORY; + } + pending_input_NAL->pts = pts; + pending_input_NAL->user_data = user_data; + nal = pending_input_NAL; + out = nal->data(); + + input_push_state=3; + //nal->clear_skipped_bytes(); + } + else { + *out++ = 0; + *out++ = 0; + *out++ = *data; + + input_push_state=5; + } + break; + } + + data++; + } + + nal->set_size(out - nal->data()); + return DE265_OK; +} + + +de265_error NAL_Parser::push_NAL(const unsigned char* data, int len, + de265_PTS pts, void* user_data) +{ 
+ + // Cannot use byte-stream input and NAL input at the same time. + assert(pending_input_NAL == NULL); + + end_of_frame = false; + + NAL_unit* nal = alloc_NAL_unit(len); + if (nal == NULL || !nal->set_data(data, len)) { + free_NAL_unit(nal); + return DE265_ERROR_OUT_OF_MEMORY; + } + nal->pts = pts; + nal->user_data = user_data; + + nal->remove_stuffing_bytes(); + + push_to_NAL_queue(nal); + + return DE265_OK; +} + + +de265_error NAL_Parser::flush_data() +{ + if (pending_input_NAL) { + NAL_unit* nal = pending_input_NAL; + uint8_t null[2] = { 0,0 }; + + // append bytes that are implied by the push state + + if (input_push_state==6) { + if (!nal->append(null,1)) { + return DE265_ERROR_OUT_OF_MEMORY; + } + } + if (input_push_state==7) { + if (!nal->append(null,2)) { + return DE265_ERROR_OUT_OF_MEMORY; + } + } + + + // only push the NAL if it contains at least the NAL header + + if (input_push_state>=5) { + push_to_NAL_queue(nal); + pending_input_NAL = NULL; + } + + input_push_state = 0; + } + + return DE265_OK; +} + + +void NAL_Parser::remove_pending_input_data() +{ + // --- remove pending input data --- + + if (pending_input_NAL) { + free_NAL_unit(pending_input_NAL); + pending_input_NAL = NULL; + } + + for (;;) { + NAL_unit* nal = pop_from_NAL_queue(); + if (nal) { free_NAL_unit(nal); } + else break; + } + + input_push_state = 0; + nBytes_in_NAL_queue = 0; +} diff --git a/nal.cc b/nal.cc new file mode 100644 index 0000000..380f04d --- /dev/null +++ b/nal.cc @@ -0,0 +1,166 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "nal.h" +#include "cabac.h" +#include + + +void nal_header::read(bitreader* reader) +{ + skip_bits(reader,1); + nal_unit_type = get_bits(reader,6); + nuh_layer_id = get_bits(reader,6); + nuh_temporal_id = get_bits(reader,3) -1; +} + + +void nal_header::write(CABAC_encoder& out) const +{ + out.skip_bits(1); + out.write_bits(nal_unit_type,6); + out.write_bits(nuh_layer_id ,6); + out.write_bits(nuh_temporal_id+1,3); +} + + +bool isIDR(uint8_t unit_type) +{ + return (unit_type == NAL_UNIT_IDR_W_RADL || + unit_type == NAL_UNIT_IDR_N_LP); +} + +bool isBLA(uint8_t unit_type) +{ + return (unit_type == NAL_UNIT_BLA_W_LP || + unit_type == NAL_UNIT_BLA_W_RADL || + unit_type == NAL_UNIT_BLA_N_LP); +} + +bool isCRA(uint8_t unit_type) +{ + return unit_type == NAL_UNIT_CRA_NUT; +} + +bool isRAP(uint8_t unit_type) +{ + return isIDR(unit_type) || isBLA(unit_type) || isCRA(unit_type); +} + +bool isRASL(uint8_t unit_type) +{ + return (unit_type == NAL_UNIT_RASL_N || + unit_type == NAL_UNIT_RASL_R); +} + +bool isIRAP(uint8_t unit_type) +{ + return (unit_type >= NAL_UNIT_BLA_W_LP && + unit_type <= NAL_UNIT_RESERVED_IRAP_VCL23); +} + +bool isRADL(uint8_t unit_type) +{ + return (unit_type == NAL_UNIT_RADL_N || + unit_type == NAL_UNIT_RADL_R); +} + + +bool isReferenceNALU(uint8_t unit_type) +{ + return ( ((unit_type <= NAL_UNIT_RESERVED_VCL_R15) && (unit_type%2 != 0)) || + ((unit_type >= NAL_UNIT_BLA_W_LP) && + (unit_type <= NAL_UNIT_RESERVED_IRAP_VCL23)) ); +} + +bool isSublayerNonReference(uint8_t unit_type) +{ + switch (unit_type) { + case NAL_UNIT_TRAIL_N: + case NAL_UNIT_TSA_N: + case 
NAL_UNIT_STSA_N: + case NAL_UNIT_RADL_N: + case NAL_UNIT_RASL_N: + case NAL_UNIT_RESERVED_VCL_N10: + case NAL_UNIT_RESERVED_VCL_N12: + case NAL_UNIT_RESERVED_VCL_N14: + return true; + + default: + return false; + } +} + +static const char* NAL_unit_name[] = { + "TRAIL_N", // 0 + "TRAIL_R", + "TSA_N", + "TSA_R", + "STSA_N", + "STSA_R", // 5 + "RADL_N", + "RADL_R", + "RASL_N", + "RASL_R", + "RESERVED_VCL_N10", // 10 + "RESERVED_VCL_R11", + "RESERVED_VCL_N12", + "RESERVED_VCL_R13", + "RESERVED_VCL_N14", + "RESERVED_VCL_R15", // 15 + "BLA_W_LP", + "BLA_W_RADL", + "BLA_N_LP", + "IDR_W_RADL", + "IDR_N_LP", // 20 + "CRA_NUT", + "RESERVED_IRAP_VCL22", + "RESERVED_IRAP_VCL23", + "RESERVED_VCL24", + "RESERVED_VCL25", // 25 + "RESERVED_VCL26", + "RESERVED_VCL27", + "RESERVED_VCL28", + "RESERVED_VCL29", + "RESERVED_VCL30", // 30 + "RESERVED_VCL31", + "VPS", + "SPS", + "PPS", + "AUD", // 35 + "EOS", + "EOB", + "FD", + "PREFIX_SEI", + "SUFFIX_SEI", // 40 + "RESERVED_NVCL41", + "RESERVED_NVCL42", + "RESERVED_NVCL43", + "RESERVED_NVCL44", + "RESERVED_NVCL45", // 45 + "RESERVED_NVCL46", + "RESERVED_NVCL47" +}; + +const char* get_NAL_name(uint8_t unit_type) +{ + if (unit_type >= 48) { return "INVALID NAL >= 48"; } + return NAL_unit_name[unit_type]; +} diff --git a/pps.cc b/pps.cc new file mode 100644 index 0000000..de3bcda --- /dev/null +++ b/pps.cc @@ -0,0 +1,992 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "pps.h" +#include "decctx.h" +#include "util.h" + +#include +#include +#include +#if defined(_MSC_VER) || defined(__MINGW32__) +# include +#elif defined(HAVE_ALLOCA_H) +# include +#endif + + +void pps_range_extension::reset() +{ + log2_max_transform_skip_block_size = 2; + cross_component_prediction_enabled_flag = false; + chroma_qp_offset_list_enabled_flag = false; + diff_cu_chroma_qp_offset_depth = 0; + chroma_qp_offset_list_len = 0; + log2_sao_offset_scale_luma = 0; + log2_sao_offset_scale_chroma = 0; +} + + +bool pps_range_extension::read(bitreader* br, decoder_context* ctx, const pic_parameter_set* pps) +{ + const seq_parameter_set* sps = ctx->get_sps(pps->seq_parameter_set_id); + + int uvlc; + + if (pps->transform_skip_enabled_flag) { + uvlc = get_uvlc(br); + if (uvlc == UVLC_ERROR || + uvlc+2 > sps->Log2MaxTrafoSize) { + + // Note: this is out of spec, but the conformance stream + // PERSIST_RPARAM_A_RExt_Sony_2 codes a too large value. 
+ + //ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + //return false; + } + + log2_max_transform_skip_block_size = uvlc+2; + } + + cross_component_prediction_enabled_flag = get_bits(br,1); + if (sps->ChromaArrayType != CHROMA_444 && + cross_component_prediction_enabled_flag) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + } + + chroma_qp_offset_list_enabled_flag = get_bits(br,1); + if (sps->ChromaArrayType == CHROMA_MONO && + chroma_qp_offset_list_enabled_flag) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + } + + if (chroma_qp_offset_list_enabled_flag) { + uvlc = get_uvlc(br); + if (uvlc == UVLC_ERROR || + uvlc > sps->log2_diff_max_min_luma_coding_block_size) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + diff_cu_chroma_qp_offset_depth = uvlc; + + + uvlc = get_uvlc(br); + if (uvlc == UVLC_ERROR || + uvlc > 5) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + chroma_qp_offset_list_len = uvlc+1; + + for (int i=0;i 12) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + cb_qp_offset_list[i] = svlc; + + svlc = get_svlc(br); + if (svlc == UVLC_ERROR || + svlc < -12 || svlc > 12) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + cr_qp_offset_list[i] = svlc; + } + } + + + uvlc = get_uvlc(br); + if (uvlc == UVLC_ERROR || + uvlc > libde265_max(0, sps->BitDepth_Y-10)) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + log2_sao_offset_scale_luma = uvlc; + + uvlc = get_uvlc(br); + if (uvlc == UVLC_ERROR || + uvlc > libde265_max(0, sps->BitDepth_C-10)) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + log2_sao_offset_scale_chroma = uvlc; + + return true; +} + + +void pps_range_extension::dump(int fd) const +{ + FILE* fh; + if (fd==1) fh=stdout; + else if (fd==2) fh=stderr; + else { return; } + +#define LOG0(t) 
log2fh(fh, t) +#define LOG1(t,d) log2fh(fh, t,d) +#define LOG2(t,d,e) log2fh(fh, t,d,e) + + LOG0("---------- PPS range-extension ----------\n"); + LOG1("log2_max_transform_skip_block_size : %d\n", log2_max_transform_skip_block_size); + LOG1("cross_component_prediction_enabled_flag : %d\n", cross_component_prediction_enabled_flag); + LOG1("chroma_qp_offset_list_enabled_flag : %d\n", chroma_qp_offset_list_enabled_flag); + if (chroma_qp_offset_list_enabled_flag) { + LOG1("diff_cu_chroma_qp_offset_depth : %d\n", diff_cu_chroma_qp_offset_depth); + LOG1("chroma_qp_offset_list_len : %d\n", chroma_qp_offset_list_len); + for (int i=0;i= DE265_MAX_PPS_SETS || + uvlc == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false); + return false; + } + + seq_parameter_set_id = uvlc = get_uvlc(br); + if (uvlc >= DE265_MAX_SPS_SETS || + uvlc == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_NONEXISTING_SPS_REFERENCED, false); + return false; + } + + dependent_slice_segments_enabled_flag = get_bits(br,1); + output_flag_present_flag = get_bits(br,1); + num_extra_slice_header_bits = get_bits(br,3); + sign_data_hiding_flag = get_bits(br,1); + cabac_init_present_flag = get_bits(br,1); + num_ref_idx_l0_default_active = uvlc = get_uvlc(br); + if (uvlc == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + num_ref_idx_l0_default_active++; + + num_ref_idx_l1_default_active = uvlc = get_uvlc(br); + if (uvlc == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + num_ref_idx_l1_default_active++; + + + if (!ctx->has_sps(seq_parameter_set_id)) { + ctx->add_warning(DE265_WARNING_NONEXISTING_SPS_REFERENCED, false); + return false; + } + + sps = ctx->get_shared_sps(seq_parameter_set_id); + + if ((pic_init_qp = get_svlc(br)) == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + pic_init_qp += 26; + + constrained_intra_pred_flag = 
get_bits(br,1); + transform_skip_enabled_flag = get_bits(br,1); + cu_qp_delta_enabled_flag = get_bits(br,1); + + if (cu_qp_delta_enabled_flag) { + if ((diff_cu_qp_delta_depth = get_uvlc(br)) == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + } else { + diff_cu_qp_delta_depth = 0; + } + + if ((pic_cb_qp_offset = get_svlc(br)) == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + if ((pic_cr_qp_offset = get_svlc(br)) == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + pps_slice_chroma_qp_offsets_present_flag = get_bits(br,1); + weighted_pred_flag = get_bits(br,1); + weighted_bipred_flag = get_bits(br,1); + transquant_bypass_enable_flag = get_bits(br,1); + tiles_enabled_flag = get_bits(br,1); + entropy_coding_sync_enabled_flag = get_bits(br,1); + + + // --- tiles --- + + if (tiles_enabled_flag) { + num_tile_columns = get_uvlc(br); + if (num_tile_columns == UVLC_ERROR || + num_tile_columns+1 > DE265_MAX_TILE_COLUMNS) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + num_tile_columns++; + + num_tile_rows = get_uvlc(br); + if (num_tile_rows == UVLC_ERROR || + num_tile_rows+1 > DE265_MAX_TILE_ROWS) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + num_tile_rows++; + + uniform_spacing_flag = get_bits(br,1); + + if (uniform_spacing_flag==false) { + int lastColumnWidth = sps->PicWidthInCtbsY; + int lastRowHeight = sps->PicHeightInCtbsY; + + for (int i=0; iadd_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + colWidth[i]++; + + lastColumnWidth -= colWidth[i]; + } + + if (lastColumnWidth <= 0) { + return false; + } + + colWidth[num_tile_columns-1] = lastColumnWidth; + + for (int i=0; iadd_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + rowHeight[i]++; + lastRowHeight -= rowHeight[i]; + } + + if (lastRowHeight <= 0) { + 
return false; + } + + + rowHeight[num_tile_rows-1] = lastRowHeight; + } + + loop_filter_across_tiles_enabled_flag = get_bits(br,1); + + } else { + num_tile_columns = 1; + num_tile_rows = 1; + uniform_spacing_flag = 1; + loop_filter_across_tiles_enabled_flag = 0; + } + + + + // END tiles + + + + beta_offset = 0; // default value + tc_offset = 0; // default value + + pps_loop_filter_across_slices_enabled_flag = get_bits(br,1); + deblocking_filter_control_present_flag = get_bits(br,1); + if (deblocking_filter_control_present_flag) { + deblocking_filter_override_enabled_flag = get_bits(br,1); + pic_disable_deblocking_filter_flag = get_bits(br,1); + if (!pic_disable_deblocking_filter_flag) { + beta_offset = get_svlc(br); + if (beta_offset == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + beta_offset *= 2; + + tc_offset = get_svlc(br); + if (tc_offset == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + tc_offset *= 2; + } + } + else { + deblocking_filter_override_enabled_flag = 0; + pic_disable_deblocking_filter_flag = 0; + } + + + // --- scaling list --- + + pic_scaling_list_data_present_flag = get_bits(br,1); + + // check consistency: if scaling-lists are not enabled, pic_scalign_list_data_present_flag + // must be FALSE + if (sps->scaling_list_enable_flag==0 && + pic_scaling_list_data_present_flag != 0) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + if (pic_scaling_list_data_present_flag) { + de265_error err = read_scaling_list(br, sps.get(), &scaling_list, true); + if (err != DE265_OK) { + ctx->add_warning(err, false); + return false; + } + } + else { + memcpy(&scaling_list, &sps->scaling_list, sizeof(scaling_list_data)); + } + + + + + lists_modification_present_flag = get_bits(br,1); + log2_parallel_merge_level = get_uvlc(br); + if (log2_parallel_merge_level == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, 
false); + return false; + } + log2_parallel_merge_level += 2; + + if (log2_parallel_merge_level-2 > sps->log2_min_luma_coding_block_size-3 +1 + + sps->log2_diff_max_min_luma_coding_block_size) { + return false; + } + + slice_segment_header_extension_present_flag = get_bits(br,1); + pps_extension_flag = get_bits(br,1); + + if (pps_extension_flag) { + pps_range_extension_flag = get_bits(br,1); + pps_multilayer_extension_flag = get_bits(br,1); + pps_extension_6bits = get_bits(br,6); + + if (pps_range_extension_flag) { + bool success = range_extension.read(br, ctx, this); + if (!success) { + return false; + } + } + + //assert(false); + /* + while( more_rbsp_data() ) + + pps_extension_data_flag + u(1) + rbsp_trailing_bits() + + } + */ + } + + + set_derived_values(sps.get()); + + pps_read = true; + + return true; +} + + +void pic_parameter_set::set_derived_values(const seq_parameter_set* sps) +{ + Log2MinCuQpDeltaSize = sps->Log2CtbSizeY - diff_cu_qp_delta_depth; + + Log2MinCuChromaQpOffsetSize = sps->Log2CtbSizeY - range_extension.diff_cu_chroma_qp_offset_depth; + Log2MaxTransformSkipSize = range_extension.log2_max_transform_skip_block_size; + + if (uniform_spacing_flag) { + + // set columns widths + + int *const colPos = (int *)alloca((num_tile_columns+1) * sizeof(int)); + + for (int i=0;i<=num_tile_columns;i++) { + colPos[i] = i*sps->PicWidthInCtbsY / num_tile_columns; + } + for (int i=0;iPicHeightInCtbsY / num_tile_rows; + } + for (int i=0;iPicSizeInCtbsY); + CtbAddrTStoRS.resize(sps->PicSizeInCtbsY); + TileId .resize(sps->PicSizeInCtbsY); + TileIdRS .resize(sps->PicSizeInCtbsY); + MinTbAddrZS .resize(sps->PicSizeInTbsY ); + + + // raster scan (RS) <-> tile scan (TS) conversion + + for (int ctbAddrRS=0 ; ctbAddrRS < sps->PicSizeInCtbsY ; ctbAddrRS++) + { + int tbX = ctbAddrRS % sps->PicWidthInCtbsY; + int tbY = ctbAddrRS / sps->PicWidthInCtbsY; + int tileX=-1,tileY=-1; + + for (int i=0;i= colBd[i]) + tileX=i; + + for (int j=0;j= rowBd[j]) + tileY=j; + + 
CtbAddrRStoTS[ctbAddrRS] = 0; + for (int i=0;iCtbAddrRStoTS[ctbAddrRS] += (tbY - pps->rowBd[tileY])*pps->colWidth[tileX]; + //pps->CtbAddrRStoTS[ctbAddrRS] += tbX - pps->colBd[tileX]; + + CtbAddrRStoTS[ctbAddrRS] += sps->PicWidthInCtbsY * rowHeight[j]; + } + + assert(tileX>=0 && tileY>=0); + + CtbAddrRStoTS[ctbAddrRS] += (tbY-rowBd[tileY])*colWidth[tileX]; + CtbAddrRStoTS[ctbAddrRS] += tbX - colBd[tileX]; + + + // inverse mapping + + CtbAddrTStoRS[ CtbAddrRStoTS[ctbAddrRS] ] = ctbAddrRS; + } + + +#if 0 + logtrace(LogHeaders,"6.5.1 CtbAddrRSToTS\n"); + for (int y=0;yPicHeightInCtbsY;y++) + { + for (int x=0;xPicWidthInCtbsY;x++) + { + logtrace(LogHeaders,"%3d ", CtbAddrRStoTS[x + y*sps->PicWidthInCtbsY]); + } + + logtrace(LogHeaders,"\n"); + } +#endif + + // tile id + + for (int j=0, tIdx=0 ; jPicWidthInCtbsY + x] ] = tIdx; + TileIdRS[ y*sps->PicWidthInCtbsY + x ] = tIdx; + + //logtrace(LogHeaders,"tileID[%d,%d] = %d\n",x,y,pps->TileIdRS[ y*sps->PicWidthInCtbsY + x ]); + } + + tIdx++; + } + +#if 0 + logtrace(LogHeaders,"Tile IDs RS:\n"); + for (int y=0;yPicHeightInCtbsY;y++) { + for (int x=0;xPicWidthInCtbsY;x++) { + logtrace(LogHeaders,"%2d ",TileIdRS[y*sps->PicWidthInCtbsY+x]); + } + logtrace(LogHeaders,"\n"); + } +#endif + + // 6.5.2 Z-scan order array initialization process + + for (int y=0;yPicHeightInTbsY;y++) + for (int x=0;xPicWidthInTbsY;x++) + { + int tbX = (x<Log2MinTrafoSize)>>sps->Log2CtbSizeY; + int tbY = (y<Log2MinTrafoSize)>>sps->Log2CtbSizeY; + int ctbAddrRS = sps->PicWidthInCtbsY*tbY + tbX; + + MinTbAddrZS[x + y*sps->PicWidthInTbsY] = CtbAddrRStoTS[ctbAddrRS] + << ((sps->Log2CtbSizeY-sps->Log2MinTrafoSize)*2); + + int p=0; + for (int i=0 ; i<(sps->Log2CtbSizeY - sps->Log2MinTrafoSize) ; i++) { + int m=1<PicWidthInTbsY] += p; + } + + + // --- debug logging --- + + /* + logtrace(LogHeaders,"6.5.2 Z-scan order array\n"); + for (int y=0;yPicHeightInTbsY;y++) + { + for (int x=0;xPicWidthInTbsY;x++) + { + logtrace(LogHeaders,"%4d ", pps->MinTbAddrZS[x + 
y*sps->PicWidthInTbsY]); + } + + logtrace(LogHeaders,"\n"); + } + + for (int i=0;iPicSizeInTbsY;i++) + { + for (int y=0;yPicHeightInTbsY;y++) + { + for (int x=0;xPicWidthInTbsY;x++) + { + if (pps->MinTbAddrZS[x + y*sps->PicWidthInTbsY] == i) { + logtrace(LogHeaders,"%d %d\n",x,y); + } + } + } + } + */ +} + + +bool pic_parameter_set::write(error_queue* errqueue, CABAC_encoder& out, + const seq_parameter_set* sps) +{ + if (pic_parameter_set_id >= DE265_MAX_PPS_SETS) { + errqueue->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false); + return false; + } + out.write_uvlc(pic_parameter_set_id); + + if (seq_parameter_set_id >= DE265_MAX_PPS_SETS) { + errqueue->add_warning(DE265_WARNING_NONEXISTING_SPS_REFERENCED, false); + return false; + } + out.write_uvlc(seq_parameter_set_id); + + out.write_bit(dependent_slice_segments_enabled_flag); + out.write_bit(output_flag_present_flag); + out.write_bits(num_extra_slice_header_bits,3); + out.write_bit(sign_data_hiding_flag); + out.write_bit(cabac_init_present_flag); + out.write_uvlc(num_ref_idx_l0_default_active-1); + out.write_uvlc(num_ref_idx_l1_default_active-1); + + out.write_svlc(pic_init_qp-26); + + out.write_bit(constrained_intra_pred_flag); + out.write_bit(transform_skip_enabled_flag); + out.write_bit(cu_qp_delta_enabled_flag); + + if (cu_qp_delta_enabled_flag) { + out.write_uvlc(diff_cu_qp_delta_depth); + } + + out.write_svlc(pic_cb_qp_offset); + out.write_svlc(pic_cr_qp_offset); + + out.write_bit(pps_slice_chroma_qp_offsets_present_flag); + out.write_bit(weighted_pred_flag); + out.write_bit(weighted_bipred_flag); + out.write_bit(transquant_bypass_enable_flag); + out.write_bit(tiles_enabled_flag); + out.write_bit(entropy_coding_sync_enabled_flag); + + + // --- tiles --- + + if (tiles_enabled_flag) { + if (num_tile_columns > DE265_MAX_TILE_COLUMNS) { + errqueue->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + out.write_uvlc(num_tile_columns-1); + + if (num_tile_rows > 
DE265_MAX_TILE_ROWS) { + errqueue->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + out.write_uvlc(num_tile_rows-1); + + out.write_bit(uniform_spacing_flag); + + if (uniform_spacing_flag==false) { + for (int i=0; iscaling_list_enable_flag==0 && + pic_scaling_list_data_present_flag != 0) { + errqueue->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + if (pic_scaling_list_data_present_flag) { + de265_error err = write_scaling_list(out, sps, &scaling_list, true); + if (err != DE265_OK) { + errqueue->add_warning(err, false); + return false; + } + } + + + + out.write_bit(lists_modification_present_flag); + out.write_uvlc(log2_parallel_merge_level-2); + + out.write_bit(slice_segment_header_extension_present_flag); + out.write_bit(pps_extension_flag); + + if (pps_extension_flag) { + //assert(false); + /* + while( more_rbsp_data() ) + + pps_extension_data_flag + u(1) + rbsp_trailing_bits() + + } + */ + } + + + pps_read = true; + + return true; +} + + +void pic_parameter_set::dump(int fd) const +{ + FILE* fh; + if (fd==1) fh=stdout; + else if (fd==2) fh=stderr; + else { return; } + +#define LOG0(t) log2fh(fh, t) +#define LOG1(t,d) log2fh(fh, t,d) + + LOG0("----------------- PPS -----------------\n"); + LOG1("pic_parameter_set_id : %d\n", pic_parameter_set_id); + LOG1("seq_parameter_set_id : %d\n", seq_parameter_set_id); + LOG1("dependent_slice_segments_enabled_flag : %d\n", dependent_slice_segments_enabled_flag); + LOG1("sign_data_hiding_flag : %d\n", sign_data_hiding_flag); + LOG1("cabac_init_present_flag : %d\n", cabac_init_present_flag); + LOG1("num_ref_idx_l0_default_active : %d\n", num_ref_idx_l0_default_active); + LOG1("num_ref_idx_l1_default_active : %d\n", num_ref_idx_l1_default_active); + + LOG1("pic_init_qp : %d\n", pic_init_qp); + LOG1("constrained_intra_pred_flag: %d\n", constrained_intra_pred_flag); + LOG1("transform_skip_enabled_flag: %d\n", transform_skip_enabled_flag); + LOG1("cu_qp_delta_enabled_flag : 
%d\n", cu_qp_delta_enabled_flag); + + if (cu_qp_delta_enabled_flag) { + LOG1("diff_cu_qp_delta_depth : %d\n", diff_cu_qp_delta_depth); + } + + LOG1("pic_cb_qp_offset : %d\n", pic_cb_qp_offset); + LOG1("pic_cr_qp_offset : %d\n", pic_cr_qp_offset); + LOG1("pps_slice_chroma_qp_offsets_present_flag : %d\n", pps_slice_chroma_qp_offsets_present_flag); + LOG1("weighted_pred_flag : %d\n", weighted_pred_flag); + LOG1("weighted_bipred_flag : %d\n", weighted_bipred_flag); + LOG1("output_flag_present_flag : %d\n", output_flag_present_flag); + LOG1("transquant_bypass_enable_flag: %d\n", transquant_bypass_enable_flag); + LOG1("tiles_enabled_flag : %d\n", tiles_enabled_flag); + LOG1("entropy_coding_sync_enabled_flag: %d\n", entropy_coding_sync_enabled_flag); + + if (tiles_enabled_flag) { + LOG1("num_tile_columns : %d\n", num_tile_columns); + LOG1("num_tile_rows : %d\n", num_tile_rows); + LOG1("uniform_spacing_flag: %d\n", uniform_spacing_flag); + + LOG0("tile column boundaries: "); + for (int i=0;i<=num_tile_columns;i++) { + LOG1("*%d ",colBd[i]); + } + LOG0("*\n"); + + LOG0("tile row boundaries: "); + for (int i=0;i<=num_tile_rows;i++) { + LOG1("*%d ",rowBd[i]); + } + LOG0("*\n"); + + //if( !uniform_spacing_flag ) { + /* + for( i = 0; i < num_tile_columns_minus1; i++ ) + + column_width_minus1[i] + ue(v) + for( i = 0; i < num_tile_rows_minus1; i++ ) + + row_height_minus1[i] + ue(v) + } + */ + + LOG1("loop_filter_across_tiles_enabled_flag : %d\n", loop_filter_across_tiles_enabled_flag); + } + + LOG1("pps_loop_filter_across_slices_enabled_flag: %d\n", pps_loop_filter_across_slices_enabled_flag); + LOG1("deblocking_filter_control_present_flag: %d\n", deblocking_filter_control_present_flag); + + if (deblocking_filter_control_present_flag) { + LOG1("deblocking_filter_override_enabled_flag: %d\n", deblocking_filter_override_enabled_flag); + LOG1("pic_disable_deblocking_filter_flag: %d\n", pic_disable_deblocking_filter_flag); + + LOG1("beta_offset: %d\n", beta_offset); + 
LOG1("tc_offset: %d\n", tc_offset); + } + + LOG1("pic_scaling_list_data_present_flag: %d\n", pic_scaling_list_data_present_flag); + if (pic_scaling_list_data_present_flag) { + //scaling_list_data() + } + + LOG1("lists_modification_present_flag: %d\n", lists_modification_present_flag); + LOG1("log2_parallel_merge_level : %d\n", log2_parallel_merge_level); + LOG1("num_extra_slice_header_bits : %d\n", num_extra_slice_header_bits); + LOG1("slice_segment_header_extension_present_flag : %d\n", slice_segment_header_extension_present_flag); + LOG1("pps_extension_flag : %d\n", pps_extension_flag); + LOG1("pps_range_extension_flag : %d\n", pps_range_extension_flag); + LOG1("pps_multilayer_extension_flag : %d\n", pps_multilayer_extension_flag); + LOG1("pps_extension_6bits : %d\n", pps_extension_6bits); + + LOG1("Log2MinCuQpDeltaSize : %d\n", Log2MinCuQpDeltaSize); + LOG1("Log2MinCuChromaQpOffsetSize (RExt) : %d\n", Log2MinCuChromaQpOffsetSize); + LOG1("Log2MaxTransformSkipSize (RExt) : %d\n", Log2MaxTransformSkipSize); + +#undef LOG0 +#undef LOG1 + + + if (pps_range_extension_flag) { + range_extension.dump(fd); + } +} + + +bool pic_parameter_set::is_tile_start_CTB(int ctbX,int ctbY) const +{ + // fast check + if (tiles_enabled_flag==0) { + return ctbX == 0 && ctbY == 0; + } + + for (int i=0;i + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "quality.h" +#include + + +uint32_t SSD(const uint8_t* img, int imgStride, + const uint8_t* ref, int refStride, + int width, int height) +{ + uint32_t sum=0; + + const uint8_t* iPtr = img; + const uint8_t* rPtr = ref; + + for (int y=0;yget_image_plane_at_pos(cIdx,x0,y0), img1->get_image_stride(cIdx), + img2->get_image_plane_at_pos(cIdx,x0,y0), img2->get_image_stride(cIdx), + 1< + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "refpic.h" +#include "decctx.h" +#include "util.h" + +#include +#include +#if defined(_MSC_VER) || defined(__MINGW32__) +# include +#elif defined(HAVE_ALLOCA_H) +# include +#endif + + +void ref_pic_set::reset() +{ + NumNegativePics = 0; + NumPositivePics = 0; + NumDeltaPocs = 0; + NumPocTotalCurr_shortterm_only = 0; + + for (int i=0;i& sets, // previously read sets + bool sliceRefPicSet) // is this in the slice header? +{ + // --- is this set coded in prediction mode (not possible for the first set) + + char inter_ref_pic_set_prediction_flag; + + if (idxRps != 0) { + inter_ref_pic_set_prediction_flag = get_bits(br,1); + } + else { + inter_ref_pic_set_prediction_flag = 0; + } + + + + if (inter_ref_pic_set_prediction_flag) { + int vlc; + + /* Only for the last ref_pic_set (that's the one coded in the slice header), + we can specify relative to which reference set we code the set. 
*/ + + int delta_idx; + if (sliceRefPicSet) { // idxRps == num_short_term_ref_pic_sets) { + delta_idx = vlc = get_uvlc(br); + if (delta_idx==UVLC_ERROR) { + return false; + } + + if (delta_idx>=idxRps) { + return false; + } + + delta_idx++; + } else { + delta_idx = 1; + } + + int RIdx = idxRps - delta_idx; // this is our source set, which we will modify + assert(RIdx>=0); + + int delta_rps_sign = get_bits(br,1); + int abs_delta_rps = vlc = get_uvlc(br); + if (vlc==UVLC_ERROR) { return false; } + abs_delta_rps++; + int DeltaRPS = (delta_rps_sign ? -abs_delta_rps : abs_delta_rps); + + // bits are stored in this order: + // - all bits for negative Pocs (forward), + // - then all bits for positive Pocs (forward), + // - then bits for '0', shifting of the current picture + // in total, these are 'nDeltaPocsRIdx'+1 bits + + logtrace(LogHeaders,"predicted from %d with delta %d\n",RIdx,DeltaRPS); + + int nDeltaPocsRIdx= sets[RIdx].NumDeltaPocs; // size of source set + char *const used_by_curr_pic_flag = (char *)alloca((nDeltaPocsRIdx+1) * sizeof(char)); + char *const use_delta_flag = (char *)alloca((nDeltaPocsRIdx+1) * sizeof(char)); + + for (int j=0;j<=nDeltaPocsRIdx;j++) { + used_by_curr_pic_flag[j] = get_bits(br,1); + if (used_by_curr_pic_flag[j]) { + use_delta_flag[j] = 1; // if this frame is used, we also have to apply the delta + } else { + use_delta_flag[j] = get_bits(br,1); // otherwise, it is only optionally included + } + } + + logtrace(LogHeaders,"flags: "); + for (int j=0;j<=nDeltaPocsRIdx;j++) { + logtrace(LogHeaders,"%d ", use_delta_flag[j]); + } + logtrace(LogHeaders,"\n"); + + int nNegativeRIdx = sets[RIdx].NumNegativePics; + int nPositiveRIdx = sets[RIdx].NumPositivePics; + + // --- update list 0 (negative Poc) --- + // Iterate through all Pocs in decreasing value order (positive reverse, 0, negative forward). 
+ + int i=0; // target index + + // positive list + for (int j=nPositiveRIdx-1;j>=0;j--) { + assert(RIdx >= 0 && RIdx < sets.size()); + assert(j>=0 && j < MAX_NUM_REF_PICS); + + int dPoc = sets[RIdx].DeltaPocS1[j] + DeltaRPS; // new delta + if (dPoc<0 && use_delta_flag[nNegativeRIdx+j]) { + if (i>= MAX_NUM_REF_PICS) { return false; } + + out_set->DeltaPocS0[i] = dPoc; + out_set->UsedByCurrPicS0[i] = used_by_curr_pic_flag[nNegativeRIdx+j]; + i++; + } + } + + // frame 0 + if (DeltaRPS<0 && use_delta_flag[nDeltaPocsRIdx]) { + if (i>= MAX_NUM_REF_PICS) { return false; } + + out_set->DeltaPocS0[i] = DeltaRPS; + out_set->UsedByCurrPicS0[i] = used_by_curr_pic_flag[nDeltaPocsRIdx]; + i++; + } + + // negative list + for (int j=0;j= MAX_NUM_REF_PICS) { return false; } + + out_set->DeltaPocS0[i] = dPoc; + out_set->UsedByCurrPicS0[i] = used_by_curr_pic_flag[j]; + i++; + } + } + + out_set->NumNegativePics = i; + + + // --- update list 1 (positive Poc) --- + // Iterate through all Pocs in increasing value order (negative reverse, 0, positive forward) + + i=0; // target index + + // negative list + for (int j=nNegativeRIdx-1;j>=0;j--) { + int dPoc = sets[RIdx].DeltaPocS0[j] + DeltaRPS; + if (dPoc>0 && use_delta_flag[j]) { + if (i>= MAX_NUM_REF_PICS) { return false; } + + out_set->DeltaPocS1[i] = dPoc; + out_set->UsedByCurrPicS1[i] = used_by_curr_pic_flag[j]; + i++; + } + } + + // frame 0 + if (DeltaRPS>0 && use_delta_flag[nDeltaPocsRIdx]) { + if (i>= MAX_NUM_REF_PICS) { return false; } + + out_set->DeltaPocS1[i] = DeltaRPS; + out_set->UsedByCurrPicS1[i] = used_by_curr_pic_flag[nDeltaPocsRIdx]; + i++; + } + + // positive list + for (int j=0;j0 && use_delta_flag[nNegativeRIdx+j]) { + if (i>= MAX_NUM_REF_PICS) { return false; } + + out_set->DeltaPocS1[i] = dPoc; + out_set->UsedByCurrPicS1[i] = used_by_curr_pic_flag[nNegativeRIdx+j]; + i++; + } + } + + out_set->NumPositivePics = i; + + } else { + + // --- first, read the number of past and future frames in this set --- + + int 
num_negative_pics = get_uvlc(br); + int num_positive_pics = get_uvlc(br); + + if (num_negative_pics == UVLC_ERROR || + num_positive_pics == UVLC_ERROR) { + // invalid num-ref-pics value + errqueue->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false); + return false; + } + + // total number of reference pictures may not exceed buffer capacity + if (num_negative_pics + num_positive_pics > + sps->sps_max_dec_pic_buffering[ sps->sps_max_sub_layers-1 ]) { + + out_set->NumNegativePics = 0; + out_set->NumPositivePics = 0; + out_set->NumDeltaPocs = 0; + out_set->NumPocTotalCurr_shortterm_only = 0; + + errqueue->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false); + return false; + } + + if (num_negative_pics > MAX_NUM_REF_PICS || + num_positive_pics > MAX_NUM_REF_PICS) { + errqueue->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false); + return false; + } + + out_set->NumNegativePics = num_negative_pics; + out_set->NumPositivePics = num_positive_pics; + + // --- now, read the deltas between the reference frames to fill the lists --- + + // past frames + + int lastPocS=0; + for (int i=0;iDeltaPocS0[i] = lastPocS - delta_poc_s0; + out_set->UsedByCurrPicS0[i] = used_by_curr_pic_s0_flag; + lastPocS = out_set->DeltaPocS0[i]; + } + + // future frames + + lastPocS=0; + for (int i=0;iDeltaPocS1[i] = lastPocS + delta_poc_s1; + out_set->UsedByCurrPicS1[i] = used_by_curr_pic_s1_flag; + lastPocS = out_set->DeltaPocS1[i]; + } + } + + + out_set->compute_derived_values(); + + return true; +} + + +bool write_short_term_ref_pic_set_nopred(error_queue* errqueue, + const seq_parameter_set* sps, + CABAC_encoder& out, + const ref_pic_set* in_set, // which set to write + int idxRps, // index of the set to be written + const std::vector& sets, // previously read sets + bool sliceRefPicSet) // is this in the slice header? 
+{ + if (idxRps != 0) { + // inter_ref_pic_set_prediction_flag + out.write_bit(0); + } + + + // --- first, write the number of past and future frames in this set --- + + out.write_uvlc(in_set->NumNegativePics); + out.write_uvlc(in_set->NumPositivePics); + + // --- now, write the deltas between the reference frames to fill the lists --- + + // past frames + + int lastPocS=0; + for (int i=0;iNumNegativePics;i++) { + int delta_poc_s0 = lastPocS - in_set->DeltaPocS0[i]; + char used_by_curr_pic_s0_flag = in_set->UsedByCurrPicS0[i]; + + assert(delta_poc_s0 >= 1); + out.write_uvlc(delta_poc_s0-1); + out.write_bit(used_by_curr_pic_s0_flag); + lastPocS = in_set->DeltaPocS0[i]; + } + + // future frames + + lastPocS=0; + for (int i=0;iNumPositivePics;i++) { + int delta_poc_s1 = in_set->DeltaPocS1[i] - lastPocS; + char used_by_curr_pic_s1_flag = in_set->UsedByCurrPicS1[i]; + + assert(delta_poc_s1 >= 1); + out.write_uvlc(delta_poc_s1-1); + out.write_bit(used_by_curr_pic_s1_flag); + lastPocS = in_set->DeltaPocS1[i]; + } + + return true; +} + + +bool write_short_term_ref_pic_set(error_queue* errqueue, + const seq_parameter_set* sps, + CABAC_encoder& out, + const ref_pic_set* in_set, // which set to write + int idxRps, // index of the set to be read + const std::vector& sets, // previously read sets + bool sliceRefPicSet) // is this in the slice header? 
+{ + return write_short_term_ref_pic_set_nopred(errqueue, sps, out, in_set, idxRps, sets, + sliceRefPicSet); +} + + +void dump_short_term_ref_pic_set(const ref_pic_set* set, FILE* fh) +{ + log2fh(fh,"NumDeltaPocs: %d [-:%d +:%d]\n", set->NumDeltaPocs, + set->NumNegativePics, set->NumPositivePics); + + log2fh(fh,"DeltaPocS0:"); + for (int i=0;iNumNegativePics;i++) { + if (i) { log2fh(fh,","); } + log2fh(fh," %d/%d",set->DeltaPocS0[i],set->UsedByCurrPicS0[i]); + } + log2fh(fh,"\n"); + + log2fh(fh,"DeltaPocS1:"); + for (int i=0;iNumPositivePics;i++) { + if (i) { log2fh(fh,","); } + log2fh(fh," %d/%d",set->DeltaPocS1[i],set->UsedByCurrPicS1[i]); + } + log2fh(fh,"\n"); +} + + +void dump_compact_short_term_ref_pic_set(const ref_pic_set* set, int range, FILE* fh) +{ + char *const log = (char *)alloca((range+1+range+1) * sizeof(char)); + log[2*range+1] = 0; + for (int i=0;i<2*range+1;i++) log[i]='.'; + log[range]='|'; + + for (int i=set->NumNegativePics-1;i>=0;i--) { + int n = set->DeltaPocS0[i]; + if (n>=-range) { + if (set->UsedByCurrPicS0[i]) log[n+range] = 'X'; + else log[n+range] = 'o'; + } else { log2fh(fh,"*%d%c ",n, set->UsedByCurrPicS0[i] ? 'X':'o'); } + } + + for (int i=set->NumPositivePics-1;i>=0;i--) { + int n = set->DeltaPocS1[i]; + if (n<=range) { + if (set->UsedByCurrPicS1[i]) log[n+range] = 'X'; + else log[n+range] = 'o'; + } else { log2fh(fh,"*%d%c ",n, set->UsedByCurrPicS1[i] ? 'X':'o'); } + } + + log2fh(fh,"*%s\n",log); +} diff --git a/sao.cc b/sao.cc new file mode 100644 index 0000000..f93fc02 --- /dev/null +++ b/sao.cc @@ -0,0 +1,524 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "sao.h" +#include "util.h" + +#include +#include + + +template +void apply_sao_internal(de265_image* img, int xCtb,int yCtb, + const slice_segment_header* shdr, int cIdx, int nSW,int nSH, + const pixel_t* in_img, int in_stride, + /* */ pixel_t* out_img, int out_stride) +{ + const sao_info* saoinfo = img->get_sao_info(xCtb,yCtb); + + int SaoTypeIdx = (saoinfo->SaoTypeIdx >> (2*cIdx)) & 0x3; + + logtrace(LogSAO,"apply_sao CTB %d;%d cIdx:%d type=%d (%dx%d)\n",xCtb,yCtb,cIdx, SaoTypeIdx, nSW,nSH); + + if (SaoTypeIdx==0) { + return; + } + + const seq_parameter_set* sps = &img->get_sps(); + const pic_parameter_set* pps = &img->get_pps(); + const int bitDepth = (cIdx==0 ? sps->BitDepth_Y : sps->BitDepth_C); + const int maxPixelValue = (1<get_width(cIdx); + const int height = img->get_height(cIdx); + + const int ctbSliceAddrRS = img->get_SliceHeader(xC,yC)->SliceAddrRS; + + const int picWidthInCtbs = sps->PicWidthInCtbsY; + const int chromashiftW = sps->get_chroma_shift_W(cIdx); + const int chromashiftH = sps->get_chroma_shift_H(cIdx); + const int ctbshiftW = sps->Log2CtbSizeY - chromashiftW; + const int ctbshiftH = sps->Log2CtbSizeY - chromashiftH; + + + for (int i=0;i<5;i++) + { + logtrace(LogSAO,"offset[%d] = %d\n", i, i==0 ? 0 : saoinfo->saoOffsetVal[cIdx][i-1]); + } + + + // actual size of CTB to be processed (can be smaller when partially outside of image) + const int ctbW = (xC+nSW>width) ? width -xC : nSW; + const int ctbH = (yC+nSH>height) ? 
height-yC : nSH; + + + const bool extendedTests = img->get_CTB_has_pcm_or_cu_transquant_bypass(xCtb,yCtb); + + if (SaoTypeIdx==2) { + int hPos[2], vPos[2]; + int vPosStride[2]; // vPos[] multiplied by image stride + int SaoEoClass = (saoinfo->SaoEoClass >> (2*cIdx)) & 0x3; + + switch (SaoEoClass) { + case 0: hPos[0]=-1; hPos[1]= 1; vPos[0]= 0; vPos[1]=0; break; + case 1: hPos[0]= 0; hPos[1]= 0; vPos[0]=-1; vPos[1]=1; break; + case 2: hPos[0]=-1; hPos[1]= 1; vPos[0]=-1; vPos[1]=1; break; + case 3: hPos[0]= 1; hPos[1]=-1; vPos[0]=-1; vPos[1]=1; break; + } + + vPosStride[0] = vPos[0] * in_stride; + vPosStride[1] = vPos[1] * in_stride; + + /* Reorder sao_info.saoOffsetVal[] array, so that we can index it + directly with the sum of the two pixel-difference signs. */ + int8_t saoOffsetVal[5]; // [2] unused + saoOffsetVal[0] = saoinfo->saoOffsetVal[cIdx][1-1]; + saoOffsetVal[1] = saoinfo->saoOffsetVal[cIdx][2-1]; + saoOffsetVal[2] = 0; + saoOffsetVal[3] = saoinfo->saoOffsetVal[cIdx][3-1]; + saoOffsetVal[4] = saoinfo->saoOffsetVal[cIdx][4-1]; + + + for (int j=0;jpcm_loop_filter_disable_flag && + img->get_pcm_flag((xC+i)<get_cu_transquant_bypass((xC+i)<=width || yS>=height) { + edgeIdx=0; + break; + } + + + // This part seems inefficient with all the get_SliceHeaderIndex() calls, + // but removing this part (because the input was known to have only a single + // slice anyway) reduced computation time only by 1.3%. + // TODO: however, this may still be a big part of SAO itself. 
+ + slice_segment_header* sliceHeader = img->get_SliceHeader(xS<SliceAddrRS; + if (sliceAddrRS < ctbSliceAddrRS && + img->get_SliceHeader((xC+i)<slice_loop_filter_across_slices_enabled_flag==0) { + edgeIdx=0; + break; + } + + if (sliceAddrRS > ctbSliceAddrRS && + img->get_SliceHeader(xS<slice_loop_filter_across_slices_enabled_flag==0) { + edgeIdx=0; + break; + } + + + if (pps->loop_filter_across_tiles_enabled_flag==0 && + pps->TileIdRS[(xS>>ctbshiftW) + (yS>>ctbshiftH)*picWidthInCtbs] != + pps->TileIdRS[(xC>>ctbshiftW) + (yC>>ctbshiftH)*picWidthInCtbs]) { + edgeIdx=0; + break; + } + } + + if (edgeIdx != 0) { + + edgeIdx = ( Sign(in_ptr[i] - in_ptr[i+hPos[0]+vPosStride[0]]) + + Sign(in_ptr[i] - in_ptr[i+hPos[1]+vPosStride[1]]) ); + + if (1) { // edgeIdx != 0) { // seems to be faster without this check (zero in offset table) + int offset = saoOffsetVal[edgeIdx+2]; + + out_ptr[i] = Clip3(0,maxPixelValue, + in_ptr[i] + offset); + } + } + } + } + } + else { + int bandShift = bitDepth-5; + int saoLeftClass = saoinfo->sao_band_position[cIdx]; + logtrace(LogSAO,"saoLeftClass: %d\n",saoLeftClass); + + int bandTable[32]; + memset(bandTable, 0, sizeof(int)*32); + + for (int k=0;k<4;k++) { + bandTable[ (k+saoLeftClass)&31 ] = k+1; + } + + + /* If PCM or transquant_bypass is used in this CTB, we have to + run all checks (A). + Otherwise, we run a simplified version of the code (B). + + NOTE: this whole part of SAO does not seem to be a significant part of the time spent + */ + + if (extendedTests) { + + // (A) full version with all checks + + for (int j=0;jpcm_loop_filter_disable_flag && + img->get_pcm_flag((xC+i)<get_cu_transquant_bypass((xC+i)<>x actually computes >>(x%64). + // So we have to take care of large bandShifts. 
+ int bandIdx; + if (bandShift >= 8) { + bandIdx = 0; + } else { + bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ]; + } + + if (bandIdx>0) { + int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1]; + + logtrace(LogSAO,"%d %d (%d) offset %d %x -> %x\n",xC+i,yC+j,bandIdx, + offset, + in_img[xC+i+(yC+j)*in_stride], + in_img[xC+i+(yC+j)*in_stride]+offset); + + out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue, + in_img[xC+i+(yC+j)*in_stride] + offset); + } + } + } + else + { + // (B) simplified version (only works if no PCM and transquant_bypass is active) + + for (int j=0;j= 8) { + bandIdx = 0; + } else { + bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ]; + } + + if (bandIdx>0) { + int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1]; + + out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue, + in_img[xC+i+(yC+j)*in_stride] + offset); + } + } + } + } +} + + +template +void apply_sao(de265_image* img, int xCtb,int yCtb, + const slice_segment_header* shdr, int cIdx, int nSW,int nSH, + const pixel_t* in_img, int in_stride, + /* */ pixel_t* out_img, int out_stride) +{ + if (img->high_bit_depth(cIdx)) { + apply_sao_internal(img,xCtb,yCtb, shdr,cIdx,nSW,nSH, + (uint16_t*)in_img, in_stride, + (uint16_t*)out_img,out_stride); + } + else { + apply_sao_internal(img,xCtb,yCtb, shdr,cIdx,nSW,nSH, + in_img, in_stride, + out_img,out_stride); + } +} + + +void apply_sample_adaptive_offset(de265_image* img) +{ + const seq_parameter_set& sps = img->get_sps(); + + if (sps.sample_adaptive_offset_enabled_flag==0) { + return; + } + + de265_image inputCopy; + de265_error err = inputCopy.copy_image(img); + if (err != DE265_OK) { + img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false); + return; + } + + for (int yCtb=0; yCtbget_SliceHeaderCtb(xCtb,yCtb); + + if (shdr->slice_sao_luma_flag) { + apply_sao(img, xCtb,yCtb, shdr, 0, 1<get_image_plane(0), img->get_image_stride(0)); + } + + if (shdr->slice_sao_chroma_flag) { + int nSW = 
(1<get_image_plane(1), img->get_image_stride(1)); + + apply_sao(img, xCtb,yCtb, shdr, 2, nSW,nSH, + inputCopy.get_image_plane(2), inputCopy.get_image_stride(2), + img->get_image_plane(2), img->get_image_stride(2)); + } + } +} + + +void apply_sample_adaptive_offset_sequential(de265_image* img) +{ + const seq_parameter_set& sps = img->get_sps(); + + if (sps.sample_adaptive_offset_enabled_flag==0) { + return; + } + + int lumaImageSize = img->get_image_stride(0) * img->get_height(0) * img->get_bytes_per_pixel(0); + int chromaImageSize = img->get_image_stride(1) * img->get_height(1) * img->get_bytes_per_pixel(1); + + uint8_t* inputCopy = new uint8_t[ libde265_max(lumaImageSize, chromaImageSize) ]; + if (inputCopy == NULL) { + img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false); + return; + } + + + int nChannels = 3; + if (sps.ChromaArrayType == CHROMA_MONO) { nChannels=1; } + + for (int cIdx=0;cIdxget_image_stride(cIdx); + int height = img->get_height(cIdx); + + memcpy(inputCopy, img->get_image_plane(cIdx), stride * height * img->get_bytes_per_pixel(cIdx)); + + for (int yCtb=0; yCtbget_SliceHeaderCtb(xCtb,yCtb); + if (shdr==NULL) { return; } + + if (cIdx==0 && shdr->slice_sao_luma_flag) { + apply_sao(img, xCtb,yCtb, shdr, 0, 1<get_image_plane(0), img->get_image_stride(0)); + } + + if (cIdx!=0 && shdr->slice_sao_chroma_flag) { + int nSW = (1<get_image_plane(cIdx), img->get_image_stride(cIdx)); + } + } + } + + delete[] inputCopy; +} + + + + +class thread_task_sao : public thread_task +{ +public: + int ctb_y; + de265_image* img; /* this is where we get the SPS from + (either inputImg or outputImg can be a dummy image) + */ + + de265_image* inputImg; + de265_image* outputImg; + int inputProgress; + + virtual void work(); + virtual std::string name() const { + char buf[100]; + sprintf(buf,"sao-%d",ctb_y); + return buf; + } +}; + + +void thread_task_sao::work() +{ + state = Running; + img->thread_run(this); + + const seq_parameter_set& sps = 
img->get_sps(); + + const int rightCtb = sps.PicWidthInCtbsY-1; + const int ctbSize = (1<wait_for_progress(this, rightCtb,ctb_y, inputProgress); + + if (ctb_y>0) { + img->wait_for_progress(this, rightCtb,ctb_y-1, inputProgress); + } + + if (ctb_y+1wait_for_progress(this, rightCtb,ctb_y+1, inputProgress); + } + + + // copy input image to output for this CTB-row + + outputImg->copy_lines_from(inputImg, ctb_y * ctbSize, (ctb_y+1) * ctbSize); + + + // process SAO in the CTB-row + + for (int xCtb=0; xCtbget_SliceHeaderCtb(xCtb,ctb_y); + if (shdr==NULL) { + break; + } + + if (shdr->slice_sao_luma_flag) { + apply_sao(img, xCtb,ctb_y, shdr, 0, ctbSize, ctbSize, + inputImg ->get_image_plane(0), inputImg ->get_image_stride(0), + outputImg->get_image_plane(0), outputImg->get_image_stride(0)); + } + + if (shdr->slice_sao_chroma_flag) { + int nSW = ctbSize / sps.SubWidthC; + int nSH = ctbSize / sps.SubHeightC; + + apply_sao(img, xCtb,ctb_y, shdr, 1, nSW,nSH, + inputImg ->get_image_plane(1), inputImg ->get_image_stride(1), + outputImg->get_image_plane(1), outputImg->get_image_stride(1)); + + apply_sao(img, xCtb,ctb_y, shdr, 2, nSW,nSH, + inputImg ->get_image_plane(2), inputImg ->get_image_stride(2), + outputImg->get_image_plane(2), outputImg->get_image_stride(2)); + } + } + + + // mark SAO progress + + for (int x=0;x<=rightCtb;x++) { + const int CtbWidth = sps.PicWidthInCtbsY; + img->ctb_progress[x+ctb_y*CtbWidth].set_progress(CTB_PROGRESS_SAO); + } + + + state = Finished; + img->thread_finishes(this); +} + + +bool add_sao_tasks(image_unit* imgunit, int saoInputProgress) +{ + de265_image* img = imgunit->img; + const seq_parameter_set& sps = img->get_sps(); + + if (sps.sample_adaptive_offset_enabled_flag==0) { + return false; + } + + + decoder_context* ctx = img->decctx; + + de265_error err = imgunit->sao_output.alloc_image(img->get_width(), img->get_height(), + img->get_chroma_format(), + img->get_shared_sps(), + false, + img->decctx, //img->encctx, + img->pts, img->user_data, 
true); + if (err != DE265_OK) { + img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false); + return false; + } + + int nRows = sps.PicHeightInCtbsY; + + int n=0; + img->thread_start(nRows); + + for (int y=0;yinputImg = img; + task->outputImg = &imgunit->sao_output; + task->img = img; + task->ctb_y = y; + task->inputProgress = saoInputProgress; + + imgunit->tasks.push_back(task); + add_task(&ctx->thread_pool_, task); + n++; + } + + /* Currently need barrier here because when are finished, we have to swap the pixel + data back into the main image. */ + img->wait_for_completion(); + + img->exchange_pixel_data_with(imgunit->sao_output); + + return true; +} diff --git a/scan.cc b/scan.cc new file mode 100644 index 0000000..b29e283 --- /dev/null +++ b/scan.cc @@ -0,0 +1,152 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "scan.h" + +static position scan0 = { 0,0 }; +static position scan_h_1[ 2* 2], scan_v_1[ 2* 2], scan_d_1[ 2* 2]; +static position scan_h_2[ 4* 4], scan_v_2[ 4* 4], scan_d_2[ 4* 4]; +static position scan_h_3[ 8* 8], scan_v_3[ 8* 8], scan_d_3[ 8* 8]; +static position scan_h_4[16*16], scan_v_4[16*16], scan_d_4[16*16]; +static position scan_h_5[32*32], scan_v_5[32*32], scan_d_5[32*32]; + +static position* scan_h[7] = { &scan0,scan_h_1,scan_h_2,scan_h_3,scan_h_4,scan_h_5 }; +static position* scan_v[7] = { &scan0,scan_v_1,scan_v_2,scan_v_3,scan_v_4,scan_v_5 }; +static position* scan_d[7] = { &scan0,scan_d_1,scan_d_2,scan_d_3,scan_d_4,scan_d_5 }; + +static void init_scan_h(position* scan, int blkSize) +{ + int i=0; + for (int y=0;y=0) { + if (xsubBlock = lastSubBlock; + pos->scanPos = lastScanPos; +} + + +void init_scan_orders() +{ + for (int log2size=1;log2size<=5;log2size++) + { + init_scan_h(scan_h[log2size], 1< + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "sei.h" +#include "util.h" +#include "md5.h" + +#include "libde265/sps.h" +#include "libde265/image.h" +#include "libde265/decctx.h" + +#include + + +static de265_error read_sei_decoded_picture_hash(bitreader* reader, sei_message* sei, + const seq_parameter_set* sps) +{ + sei_decoded_picture_hash* seihash = &sei->data.decoded_picture_hash; + + seihash->hash_type = (enum sei_decoded_picture_hash_type)get_bits(reader,8); + + if (sps==NULL) { + return DE265_WARNING_SPS_MISSING_CANNOT_DECODE_SEI; + } + + int nHashes = sps->chroma_format_idc==0 ? 1 : 3; + for (int i=0;ihash_type) { + case sei_decoded_picture_hash_type_MD5: + for (int b=0;b<16;b++) { seihash->md5[i][b] = get_bits(reader,8); } + break; + + case sei_decoded_picture_hash_type_CRC: + seihash->crc[i] = get_bits(reader,16); + break; + + case sei_decoded_picture_hash_type_checksum: + seihash->checksum[i] = get_bits(reader,32); + break; + } + } + + return DE265_OK; +} + + +static void dump_sei_decoded_picture_hash(const sei_message* sei, + const seq_parameter_set* sps) +{ + const sei_decoded_picture_hash* seihash = &sei->data.decoded_picture_hash; + + loginfo(LogSEI," hash_type: "); + switch (seihash->hash_type) { + case sei_decoded_picture_hash_type_MD5: loginfo(LogSEI,"MD5\n"); break; + case sei_decoded_picture_hash_type_CRC: loginfo(LogSEI,"CRC\n"); break; + case sei_decoded_picture_hash_type_checksum: loginfo(LogSEI,"checksum\n"); break; + } + + int nHashes = sps->chroma_format_idc==0 ? 
1 : 3; + for (int i=0;ihash_type) { + case sei_decoded_picture_hash_type_MD5: + loginfo(LogSEI," MD5[%d]: %02x", i,seihash->md5[i][0]); + for (int b=1;b<16;b++) { + loginfo(LogSEI,"*:%02x", seihash->md5[i][b]); + } + loginfo(LogSEI,"*\n"); + break; + + case sei_decoded_picture_hash_type_CRC: + loginfo(LogSEI," CRC[%d]: %02x\n", i,seihash->crc[i]); + break; + + case sei_decoded_picture_hash_type_checksum: + loginfo(LogSEI," checksum[%d]: %04x\n", i,seihash->checksum[i]); + break; + } + } +} + + +class raw_hash_data +{ +public: + raw_hash_data(int w, int stride); + ~raw_hash_data(); + + struct data_chunk { + const uint8_t* data; + int len; + }; + + data_chunk prepare_8bit(const uint8_t* data,int y); + data_chunk prepare_16bit(const uint8_t* data,int y); + +private: + int mWidth, mStride; + + uint8_t* mMem; +}; + + +raw_hash_data::raw_hash_data(int w, int stride) +{ + mWidth=w; + mStride=stride; + mMem = NULL; +} + +raw_hash_data::~raw_hash_data() +{ + delete[] mMem; +} + +raw_hash_data::data_chunk raw_hash_data::prepare_8bit(const uint8_t* data,int y) +{ + data_chunk chunk; + chunk.data = data+y*mStride; + chunk.len = mWidth; + return chunk; +} + +raw_hash_data::data_chunk raw_hash_data::prepare_16bit(const uint8_t* data,int y) +{ + if (mMem == NULL) { + mMem = new uint8_t[2*mWidth]; + } + + const uint16_t* data16 = (uint16_t*)data; + + for (int x=0; x> 8; + } + + data_chunk chunk; + chunk.data = mMem; + chunk.len = 2*mWidth; + return chunk; +} + + +static uint32_t compute_checksum_8bit(uint8_t* data,int w,int h,int stride, int bit_depth) +{ + uint32_t sum = 0; + + if (bit_depth<=8) { + for (int y=0; y> 8 ) ^ ( y >> 8 ); + sum += data[y*stride + x] ^ xorMask; + } + } + else { + for (int y=0; y> 8 ) ^ ( y >> 8 ); + sum += (data[y*stride + x] & 0xFF) ^ xorMask; + sum += (data[y*stride + x] >> 8) ^ xorMask; + } + } + + return sum & 0xFFFFFFFF; +} + +static inline uint16_t crc_process_byte(uint16_t crc, uint8_t byte) +{ + for (int bit=0;bit<8;bit++) { + int bitVal = 
(byte >> (7-bit)) & 1; + + int crcMsb = (crc>>15) & 1; + crc = (((crc<<1) + bitVal) & 0xFFFF); + + if (crcMsb) { crc ^= 0x1021; } + } + + return crc; +} + +/* +static uint16_t compute_CRC_8bit_old(const uint8_t* data,int w,int h,int stride) +{ + uint16_t crc = 0xFFFF; + + for (int y=0; y> 8); + uint16_t t = s ^ (s >> 4); + + return ((crc << 8) ^ + t ^ + (t << 5) ^ + (t << 12)) & 0xFFFF; +} + +static uint32_t compute_CRC_8bit_fast(const uint8_t* data,int w,int h,int stride, int bit_depth) +{ + raw_hash_data raw_data(w,stride); + + uint16_t crc = 0xFFFF; + + crc = crc_process_byte_parallel(crc, 0); + crc = crc_process_byte_parallel(crc, 0); + + for (int y=0; y8) + chunk = raw_data.prepare_16bit(data, y); + else + chunk = raw_data.prepare_8bit(data, y); + + for(int x=0; x8) + chunk = raw_data.prepare_16bit(data, y); + else + chunk = raw_data.prepare_8bit(data, y); + + MD5_Update(&md5, (void*)chunk.data, chunk.len); + } + + MD5_Final(result, &md5); +} + + +static de265_error process_sei_decoded_picture_hash(const sei_message* sei, de265_image* img) +{ + const sei_decoded_picture_hash* seihash = &sei->data.decoded_picture_hash; + + /* Do not check SEI on pictures that are not output. + Hash may be wrong, because of a broken link (BLA). + This happens, for example in conformance stream RAP_B, where a EOS-NAL + appears before a CRA (POC=32). */ + if (img->PicOutputFlag == false) { + return DE265_OK; + } + + //write_picture(img); + + int nHashes = img->get_sps().chroma_format_idc==0 ? 
1 : 3; + for (int i=0;iget_width(i); + h = img->get_height(i); + + data = img->get_image_plane(i); + stride = img->get_image_stride(i); + + switch (seihash->hash_type) { + case sei_decoded_picture_hash_type_MD5: + { + uint8_t md5[16]; + compute_MD5(data,w,h,stride,md5, img->get_bit_depth(i)); + +/* + fprintf(stderr,"computed MD5: "); + for (int b=0;b<16;b++) { + fprintf(stderr,"%02x", md5[b]); + } + fprintf(stderr,"\n"); +*/ + + for (int b=0;b<16;b++) { + if (md5[b] != seihash->md5[i][b]) { +/* + fprintf(stderr,"SEI decoded picture MD5 mismatch (POC=%d)\n", img->PicOrderCntVal); +*/ + return DE265_ERROR_CHECKSUM_MISMATCH; + } + } + } + break; + + case sei_decoded_picture_hash_type_CRC: + { + uint16_t crc = compute_CRC_8bit_fast(data,w,h,stride, img->get_bit_depth(i)); + + logtrace(LogSEI,"SEI decoded picture hash: %04x <-[%d]-> decoded picture: %04x\n", + seihash->crc[i], i, crc); + + if (crc != seihash->crc[i]) { +/* + fprintf(stderr,"SEI decoded picture hash: %04x, decoded picture: %04x (POC=%d)\n", + seihash->crc[i], crc, img->PicOrderCntVal); +*/ + return DE265_ERROR_CHECKSUM_MISMATCH; + } + } + break; + + case sei_decoded_picture_hash_type_checksum: + { + uint32_t chksum = compute_checksum_8bit(data,w,h,stride, img->get_bit_depth(i)); + + if (chksum != seihash->checksum[i]) { +/* + fprintf(stderr,"SEI decoded picture hash: %04x, decoded picture: %04x (POC=%d)\n", + seihash->checksum[i], chksum, img->PicOrderCntVal); +*/ + return DE265_ERROR_CHECKSUM_MISMATCH; + } + } + break; + } + } + + loginfo(LogSEI,"decoded picture hash checked: OK\n"); + //printf("checked picture %d SEI: OK\n", img->PicOrderCntVal); + + return DE265_OK; +} + + +de265_error read_sei(bitreader* reader, sei_message* sei, bool suffix, const seq_parameter_set* sps) +{ + int payload_type = 0; + for (;;) + { + int byte = get_bits(reader,8); + payload_type += byte; + if (byte != 0xFF) { break; } + } + + //printf("SEI payload: %d\n",payload_type); + + int payload_size = 0; + for (;;) + { + int 
byte = get_bits(reader,8); + payload_size += byte; + if (byte != 0xFF) { break; } + } + + sei->payload_type = (enum sei_payload_type)payload_type; + sei->payload_size = payload_size; + + + // --- sei message dispatch + + de265_error err = DE265_OK; + + switch (sei->payload_type) { + case sei_payload_type_decoded_picture_hash: + err = read_sei_decoded_picture_hash(reader,sei,sps); + break; + + default: + // TODO: unknown SEI messages are ignored + break; + } + + return err; +} + +void dump_sei(const sei_message* sei, const seq_parameter_set* sps) +{ + loginfo(LogHeaders,"SEI message: %s\n", sei_type_name(sei->payload_type)); + + switch (sei->payload_type) { + case sei_payload_type_decoded_picture_hash: + dump_sei_decoded_picture_hash(sei, sps); + break; + + default: + // TODO: unknown SEI messages are ignored + break; + } +} + + +de265_error process_sei(const sei_message* sei, de265_image* img) +{ + de265_error err = DE265_OK; + + switch (sei->payload_type) { + case sei_payload_type_decoded_picture_hash: + if (img->decctx->param_sei_check_hash) { + err = process_sei_decoded_picture_hash(sei, img); + if (err==DE265_OK) { + //printf("SEI check ok\n"); + } + } + + break; + + default: + // TODO: unknown SEI messages are ignored + break; + } + + return err; +} + + +const char* sei_type_name(enum sei_payload_type type) +{ + switch (type) { + case sei_payload_type_buffering_period: + return "buffering_period"; + case sei_payload_type_pic_timing: + return "pic_timing"; + case sei_payload_type_pan_scan_rect: + return "pan_scan_rect"; + case sei_payload_type_filler_payload: + return "filler_payload"; + case sei_payload_type_user_data_registered_itu_t_t35: + return "user_data_registered_itu_t_t35"; + case sei_payload_type_user_data_unregistered: + return "user_data_unregistered"; + case sei_payload_type_recovery_point: + return "recovery_point"; + case sei_payload_type_scene_info: + return "scene_info"; + case sei_payload_type_picture_snapshot: + return "picture_snapshot"; + 
case sei_payload_type_progressive_refinement_segment_start: + return "progressive_refinement_segment_start"; + case sei_payload_type_progressive_refinement_segment_end: + return "progressive_refinement_segment_end"; + case sei_payload_type_film_grain_characteristics: + return "film_grain_characteristics"; + case sei_payload_type_post_filter_hint: + return "post_filter_hint"; + case sei_payload_type_tone_mapping_info: + return "tone_mapping_info"; + case sei_payload_type_frame_packing_arrangement: + return "frame_packing_arrangement"; + case sei_payload_type_display_orientation: + return "display_orientation"; + case sei_payload_type_structure_of_pictures_info: + return "structure_of_pictures_info"; + case sei_payload_type_active_parameter_sets: + return "active_parameter_sets"; + case sei_payload_type_decoding_unit_info: + return "decoding_unit_info"; + case sei_payload_type_temporal_sub_layer_zero_index: + return "temporal_sub_layer_zero_index"; + case sei_payload_type_decoded_picture_hash: + return "decoded_picture_hash"; + case sei_payload_type_scalable_nesting: + return "scalable_nesting"; + case sei_payload_type_region_refresh_info: + return "region_refresh_info"; + case sei_payload_type_no_display: + return "no_display"; + case sei_payload_type_motion_constrained_tile_sets: + return "motion_constrained_tile_sets"; + + default: + return "unknown SEI message"; + } +} diff --git a/slice.cc b/slice.cc new file mode 100644 index 0000000..e85ecc6 --- /dev/null +++ b/slice.cc @@ -0,0 +1,5072 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * Authors: struktur AG, Dirk Farin + * Min Chen + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "slice.h" +#include "motion.h" +#include "util.h" +#include "scan.h" +#include "intrapred.h" +#include "transform.h" +#include "threads.h" +#include "image.h" + +#include +#include +#include + + +#define LOCK de265_mutex_lock(&ctx->thread_pool.mutex) +#define UNLOCK de265_mutex_unlock(&ctx->thread_pool.mutex) + +extern bool read_short_term_ref_pic_set(error_queue* errqueue, + const seq_parameter_set* sps, + bitreader* br, + ref_pic_set* out_set, + int idxRps, // index of the set to be read + const std::vector& sets, + bool sliceRefPicSet); + + +void read_coding_tree_unit(thread_context* tctx); +void read_coding_quadtree(thread_context* tctx, + int xCtb, int yCtb, + int Log2CtbSizeY, + int ctDepth); +/* +void decode_inter_block(decoder_context* ctx,thread_context* tctx, + int xC, int yC, int log2CbSize); +*/ + +void slice_segment_header::set_defaults() +{ + slice_index = 0; + + first_slice_segment_in_pic_flag = 1; + no_output_of_prior_pics_flag = 0; + slice_pic_parameter_set_id = 0; + dependent_slice_segment_flag = 0; + slice_segment_address = 0; + + slice_type = SLICE_TYPE_I; + pic_output_flag = 1; + colour_plane_id = 0; + slice_pic_order_cnt_lsb = 0; + short_term_ref_pic_set_sps_flag = 1; + // ref_pic_set slice_ref_pic_set; + + short_term_ref_pic_set_idx = 0; + num_long_term_sps = 0; + num_long_term_pics = 0; + + //uint8_t lt_idx_sps[MAX_NUM_REF_PICS]; + //int poc_lsb_lt[MAX_NUM_REF_PICS]; + //char used_by_curr_pic_lt_flag[MAX_NUM_REF_PICS]; + + //char delta_poc_msb_present_flag[MAX_NUM_REF_PICS]; + //int delta_poc_msb_cycle_lt[MAX_NUM_REF_PICS]; + + 
slice_temporal_mvp_enabled_flag = 0; + slice_sao_luma_flag = 0; + slice_sao_chroma_flag = 0; + + num_ref_idx_active_override_flag = 0; + num_ref_idx_l0_active=1; // [1;16] + num_ref_idx_l1_active=1; // [1;16] + + ref_pic_list_modification_flag_l0 = 0; + ref_pic_list_modification_flag_l1 = 0; + //uint8_t list_entry_l0[16]; + //uint8_t list_entry_l1[16]; + + mvd_l1_zero_flag = 0; + cabac_init_flag = 0; + collocated_from_l0_flag = 0; + collocated_ref_idx = 0; + + // --- pred_weight_table --- + + luma_log2_weight_denom=0; // [0;7] + ChromaLog2WeightDenom=0; // [0;7] + + // first index is L0/L1 + /* + uint8_t luma_weight_flag[2][16]; // bool + uint8_t chroma_weight_flag[2][16]; // bool + int16_t LumaWeight[2][16]; + int8_t luma_offset[2][16]; + int16_t ChromaWeight[2][16][2]; + int8_t ChromaOffset[2][16][2]; + */ + + + five_minus_max_num_merge_cand = 0; + slice_qp_delta = 0; + + slice_cb_qp_offset = 0; + slice_cr_qp_offset = 0; + + cu_chroma_qp_offset_enabled_flag = 0; + + deblocking_filter_override_flag = 0; + slice_deblocking_filter_disabled_flag = 0; + slice_beta_offset=0; // = pps->beta_offset if undefined + slice_tc_offset=0; // = pps->tc_offset if undefined + + slice_loop_filter_across_slices_enabled_flag = 0; + + num_entry_point_offsets = 0; + //int offset_len; + //std::vector entry_point_offset; + + slice_segment_header_extension_length = 0; + + SliceAddrRS = slice_segment_address; +} + + +bool read_pred_weight_table(bitreader* br, slice_segment_header* shdr, decoder_context* ctx) +{ + int vlc; + + pic_parameter_set* pps = ctx->get_pps((int)shdr->slice_pic_parameter_set_id); + assert(pps); + seq_parameter_set* sps = ctx->get_sps((int)pps->seq_parameter_set_id); + assert(sps); + + shdr->luma_log2_weight_denom = vlc = get_uvlc(br); + if (vlc<0 || vlc>7) return false; + + if (sps->chroma_format_idc != 0) { + vlc = get_svlc(br); + vlc += shdr->luma_log2_weight_denom; + if (vlc<0 || vlc>7) return false; + shdr->ChromaLog2WeightDenom = vlc; + } + + int sumWeightFlags 
= 0; + + for (int l=0;l<=1;l++) + if (l==0 || (l==1 && shdr->slice_type == SLICE_TYPE_B)) + { + int num_ref = (l==0 ? shdr->num_ref_idx_l0_active-1 : shdr->num_ref_idx_l1_active-1); + + for (int i=0;i<=num_ref;i++) { + shdr->luma_weight_flag[l][i] = get_bits(br,1); + if (shdr->luma_weight_flag[l][i]) sumWeightFlags++; + } + + if (sps->chroma_format_idc != 0) { + for (int i=0;i<=num_ref;i++) { + shdr->chroma_weight_flag[l][i] = get_bits(br,1); + if (shdr->chroma_weight_flag[l][i]) sumWeightFlags+=2; + } + } + + for (int i=0;i<=num_ref;i++) { + if (shdr->luma_weight_flag[l][i]) { + + // delta_luma_weight + + vlc = get_svlc(br); + if (vlc < -128 || vlc > 127) return false; + + shdr->LumaWeight[l][i] = (1<luma_log2_weight_denom) + vlc; + + // luma_offset + + vlc = get_svlc(br); + if (vlc < -sps->WpOffsetHalfRangeY || vlc > sps->WpOffsetHalfRangeY-1) return false; + shdr->luma_offset[l][i] = vlc; + } + else { + shdr->LumaWeight[l][i] = 1<luma_log2_weight_denom; + shdr->luma_offset[l][i] = 0; + } + + if (shdr->chroma_weight_flag[l][i]) + for (int j=0;j<2;j++) { + // delta_chroma_weight + + vlc = get_svlc(br); + if (vlc < -128 || vlc > 127) return false; + + shdr->ChromaWeight[l][i][j] = (1<ChromaLog2WeightDenom) + vlc; + + // delta_chroma_offset + + vlc = get_svlc(br); + if (vlc < -4*sps->WpOffsetHalfRangeC || + vlc > 4*sps->WpOffsetHalfRangeC-1) return false; + + vlc = Clip3(-sps->WpOffsetHalfRangeC, + sps->WpOffsetHalfRangeC-1, + (sps->WpOffsetHalfRangeC + +vlc + -((sps->WpOffsetHalfRangeC*shdr->ChromaWeight[l][i][j]) + >> shdr->ChromaLog2WeightDenom))); + + shdr->ChromaOffset[l][i][j] = vlc; + } + else { + for (int j=0;j<2;j++) { + shdr->ChromaWeight[l][i][j] = 1<ChromaLog2WeightDenom; + shdr->ChromaOffset[l][i][j] = 0; + } + } + } + } + + // TODO: bitstream conformance requires that 'sumWeightFlags<=24' + + return true; +} + + +void slice_segment_header::reset() +{ + pps = NULL; + + slice_index = 0; + + first_slice_segment_in_pic_flag = 0; + 
no_output_of_prior_pics_flag = 0; + slice_pic_parameter_set_id = 0; + dependent_slice_segment_flag = 0; + slice_segment_address = 0; + + slice_type = 0; + pic_output_flag = 0; + colour_plane_id = 0; + slice_pic_order_cnt_lsb = 0; + short_term_ref_pic_set_sps_flag = 0; + slice_ref_pic_set.reset(); + + short_term_ref_pic_set_idx = 0; + num_long_term_sps = 0; + num_long_term_pics= 0; + + for (int i=0;iget_RapPicFlag()) { // TODO: is this still correct ? Should we drop RapPicFlag ? + no_output_of_prior_pics_flag = get_bits(br,1); + } + + slice_pic_parameter_set_id = get_uvlc(br); + if (slice_pic_parameter_set_id > DE265_MAX_PPS_SETS || + slice_pic_parameter_set_id == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false); + return DE265_OK; + } + + if (!ctx->has_pps(slice_pic_parameter_set_id)) { + ctx->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false); + return DE265_OK; + } + + pps = ctx->get_shared_pps(slice_pic_parameter_set_id); + + const seq_parameter_set* sps = pps->sps.get(); + if (!sps->sps_read) { + ctx->add_warning(DE265_WARNING_NONEXISTING_SPS_REFERENCED, false); + *continueDecoding = false; + return DE265_OK; + } + + if (!first_slice_segment_in_pic_flag) { + if (pps->dependent_slice_segments_enabled_flag) { + dependent_slice_segment_flag = get_bits(br,1); + } else { + dependent_slice_segment_flag = 0; + } + + int slice_segment_address = get_bits(br, ceil_log2(sps->PicSizeInCtbsY)); + + if (dependent_slice_segment_flag) { + if (slice_segment_address == 0) { + *continueDecoding = false; + ctx->add_warning(DE265_WARNING_DEPENDENT_SLICE_WITH_ADDRESS_ZERO, false); + return DE265_OK; + } + + if (ctx->previous_slice_header == NULL) { + return DE265_ERROR_NO_INITIAL_SLICE_HEADER; + } + + *this = *ctx->previous_slice_header; + + first_slice_segment_in_pic_flag = 0; + dependent_slice_segment_flag = 1; + } + + this->slice_segment_address = slice_segment_address; + } else { + dependent_slice_segment_flag = 0; + 
slice_segment_address = 0; + } + + if (slice_segment_address < 0 || + slice_segment_address >= sps->PicSizeInCtbsY) { + ctx->add_warning(DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + //printf("SLICE %d (%d)\n",slice_segment_address, sps->PicSizeInCtbsY); + + + if (!dependent_slice_segment_flag) { + for (int i=0; inum_extra_slice_header_bits; i++) { + //slice_reserved_undetermined_flag[i] + skip_bits(br,1); + } + + slice_type = get_uvlc(br); + if (slice_type > 2 || + slice_type == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + *continueDecoding = false; + return DE265_OK; + } + + if (pps->output_flag_present_flag) { + pic_output_flag = get_bits(br,1); + } + else { + pic_output_flag = 1; + } + + if (sps->separate_colour_plane_flag == 1) { + colour_plane_id = get_bits(br,2); + } + + + slice_pic_order_cnt_lsb = 0; + short_term_ref_pic_set_sps_flag = 0; + + int NumLtPics = 0; + + if (ctx->get_nal_unit_type() != NAL_UNIT_IDR_W_RADL && + ctx->get_nal_unit_type() != NAL_UNIT_IDR_N_LP) { + slice_pic_order_cnt_lsb = get_bits(br, sps->log2_max_pic_order_cnt_lsb); + short_term_ref_pic_set_sps_flag = get_bits(br,1); + + if (!short_term_ref_pic_set_sps_flag) { + read_short_term_ref_pic_set(ctx, sps, + br, &slice_ref_pic_set, + sps->num_short_term_ref_pic_sets(), + sps->ref_pic_sets, + true); + + CurrRpsIdx = sps->num_short_term_ref_pic_sets(); + CurrRps = slice_ref_pic_set; + } + else { + int nBits = ceil_log2(sps->num_short_term_ref_pic_sets()); + if (nBits>0) short_term_ref_pic_set_idx = get_bits(br,nBits); + else short_term_ref_pic_set_idx = 0; + + if (short_term_ref_pic_set_idx >= sps->num_short_term_ref_pic_sets()) { + ctx->add_warning(DE265_WARNING_SHORT_TERM_REF_PIC_SET_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + CurrRpsIdx = short_term_ref_pic_set_idx; + CurrRps = sps->ref_pic_sets[CurrRpsIdx]; + } + + + // --- long-term MC --- + + if 
(sps->long_term_ref_pics_present_flag) { + if (sps->num_long_term_ref_pics_sps > 0) { + num_long_term_sps = get_uvlc(br); + if (num_long_term_sps == UVLC_ERROR) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + else { + num_long_term_sps = 0; + } + + num_long_term_pics= get_uvlc(br); + if (num_long_term_pics == UVLC_ERROR) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + // check maximum number of reference frames + + if (num_long_term_sps + + num_long_term_pics + + CurrRps.NumNegativePics + + CurrRps.NumPositivePics + > sps->sps_max_dec_pic_buffering[sps->sps_max_sub_layers-1]) + { + ctx->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false); + *continueDecoding = false; + return DE265_OK; + } + + for (int i=0; inum_long_term_ref_pics_sps); + lt_idx_sps[i] = get_bits(br, nBits); + + // check that the referenced lt-reference really exists + + if (lt_idx_sps[i] >= sps->num_long_term_ref_pics_sps) { + ctx->add_warning(DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER, false); + *continueDecoding = false; + return DE265_OK; + } + + // delta_poc_msb_present_flag[i] = 0; // TODO ? 
+ + ctx->PocLsbLt[i] = sps->lt_ref_pic_poc_lsb_sps[ lt_idx_sps[i] ]; + ctx->UsedByCurrPicLt[i] = sps->used_by_curr_pic_lt_sps_flag[ lt_idx_sps[i] ]; + } + else { + int nBits = sps->log2_max_pic_order_cnt_lsb; + poc_lsb_lt[i] = get_bits(br, nBits); + used_by_curr_pic_lt_flag[i] = get_bits(br,1); + + ctx->PocLsbLt[i] = poc_lsb_lt[i]; + ctx->UsedByCurrPicLt[i] = used_by_curr_pic_lt_flag[i]; + } + + if (ctx->UsedByCurrPicLt[i]) { + NumLtPics++; + } + + delta_poc_msb_present_flag[i] = get_bits(br,1); + if (delta_poc_msb_present_flag[i]) { + delta_poc_msb_cycle_lt[i] = get_uvlc(br); + if (delta_poc_msb_cycle_lt[i]==UVLC_ERROR) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + else { + delta_poc_msb_cycle_lt[i] = 0; + } + + if (i==0 || i==num_long_term_sps) { + ctx->DeltaPocMsbCycleLt[i] = delta_poc_msb_cycle_lt[i]; + } + else { + ctx->DeltaPocMsbCycleLt[i] = (delta_poc_msb_cycle_lt[i] + + ctx->DeltaPocMsbCycleLt[i-1]); + } + } + } + else { + num_long_term_sps = 0; + num_long_term_pics= 0; + } + + if (sps->sps_temporal_mvp_enabled_flag) { + slice_temporal_mvp_enabled_flag = get_bits(br,1); + } + else { + slice_temporal_mvp_enabled_flag = 0; + } + } + else { + slice_pic_order_cnt_lsb = 0; + num_long_term_sps = 0; + num_long_term_pics= 0; + } + + + // --- SAO --- + + if (sps->sample_adaptive_offset_enabled_flag) { + slice_sao_luma_flag = get_bits(br,1); + + if (sps->ChromaArrayType != CHROMA_MONO) { + slice_sao_chroma_flag = get_bits(br,1); + } + else { + slice_sao_chroma_flag = 0; + } + } + else { + slice_sao_luma_flag = 0; + slice_sao_chroma_flag = 0; + } + + num_ref_idx_l0_active = 0; + num_ref_idx_l1_active = 0; + + if (slice_type == SLICE_TYPE_P || + slice_type == SLICE_TYPE_B) { + num_ref_idx_active_override_flag = get_bits(br,1); + if (num_ref_idx_active_override_flag) { + num_ref_idx_l0_active = get_uvlc(br); + if (num_ref_idx_l0_active == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return 
DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + num_ref_idx_l0_active++;; + + if (slice_type == SLICE_TYPE_B) { + num_ref_idx_l1_active = get_uvlc(br); + if (num_ref_idx_l1_active == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + num_ref_idx_l1_active++; + } + } + else { + num_ref_idx_l0_active = pps->num_ref_idx_l0_default_active; + num_ref_idx_l1_active = pps->num_ref_idx_l1_default_active; + } + + if (num_ref_idx_l0_active > 16) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } + if (num_ref_idx_l1_active > 16) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } + + NumPocTotalCurr = CurrRps.NumPocTotalCurr_shortterm_only + NumLtPics; + + if (pps->lists_modification_present_flag && NumPocTotalCurr > 1) { + + int nBits = ceil_log2(NumPocTotalCurr); + + ref_pic_list_modification_flag_l0 = get_bits(br,1); + if (ref_pic_list_modification_flag_l0) { + for (int i=0;icabac_init_present_flag) { + cabac_init_flag = get_bits(br,1); + } + else { + cabac_init_flag = 0; + } + + if (slice_temporal_mvp_enabled_flag) { + if (slice_type == SLICE_TYPE_B) + collocated_from_l0_flag = get_bits(br,1); + else + collocated_from_l0_flag = 1; + + if (( collocated_from_l0_flag && num_ref_idx_l0_active > 1) || + (!collocated_from_l0_flag && num_ref_idx_l1_active > 1)) { + collocated_ref_idx = get_uvlc(br); + if (collocated_ref_idx == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + else { + collocated_ref_idx = 0; + } + + // check whether collocated_ref_idx points to a valid index + + if (( collocated_from_l0_flag && collocated_ref_idx >= num_ref_idx_l0_active) || + (!collocated_from_l0_flag && collocated_ref_idx >= num_ref_idx_l1_active)) { + ctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + + + if ((pps->weighted_pred_flag && slice_type 
== SLICE_TYPE_P) || + (pps->weighted_bipred_flag && slice_type == SLICE_TYPE_B)) { + + if (!read_pred_weight_table(br,this,ctx)) + { + ctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + + five_minus_max_num_merge_cand = get_uvlc(br); + if (five_minus_max_num_merge_cand == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + MaxNumMergeCand = 5-five_minus_max_num_merge_cand; + } + + slice_qp_delta = get_svlc(br); + if (slice_qp_delta == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + //logtrace(LogSlice,"slice_qp_delta: %d\n",shdr->slice_qp_delta); + + if (pps->pps_slice_chroma_qp_offsets_present_flag) { + slice_cb_qp_offset = get_svlc(br); + if (slice_cb_qp_offset == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + slice_cr_qp_offset = get_svlc(br); + if (slice_cr_qp_offset == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + else { + slice_cb_qp_offset = 0; + slice_cr_qp_offset = 0; + } + + if (pps->range_extension.chroma_qp_offset_list_enabled_flag) { + cu_chroma_qp_offset_enabled_flag = get_bits(br,1); + } + + if (pps->deblocking_filter_override_enabled_flag) { + deblocking_filter_override_flag = get_bits(br,1); + } + else { + deblocking_filter_override_flag = 0; + } + + slice_beta_offset = pps->beta_offset; + slice_tc_offset = pps->tc_offset; + + if (deblocking_filter_override_flag) { + slice_deblocking_filter_disabled_flag = get_bits(br,1); + if (!slice_deblocking_filter_disabled_flag) { + slice_beta_offset = get_svlc(br); + if (slice_beta_offset == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return 
DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + slice_beta_offset *= 2; + + slice_tc_offset = get_svlc(br); + if (slice_tc_offset == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + slice_tc_offset *= 2; + } + } + else { + slice_deblocking_filter_disabled_flag = pps->pic_disable_deblocking_filter_flag; + } + + if (pps->pps_loop_filter_across_slices_enabled_flag && + (slice_sao_luma_flag || slice_sao_chroma_flag || + !slice_deblocking_filter_disabled_flag )) { + slice_loop_filter_across_slices_enabled_flag = get_bits(br,1); + } + else { + slice_loop_filter_across_slices_enabled_flag = + pps->pps_loop_filter_across_slices_enabled_flag; + } + } + + if (pps->tiles_enabled_flag || pps->entropy_coding_sync_enabled_flag ) { + num_entry_point_offsets = get_uvlc(br); + if (num_entry_point_offsets == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + if (pps->entropy_coding_sync_enabled_flag) { + // check num_entry_points for valid range + + int firstCTBRow = slice_segment_address / sps->PicWidthInCtbsY; + int lastCTBRow = firstCTBRow + num_entry_point_offsets; + if (lastCTBRow >= sps->PicHeightInCtbsY) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + + if (pps->tiles_enabled_flag) { + if (num_entry_point_offsets > pps->num_tile_columns * pps->num_tile_rows) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + + entry_point_offset.resize( num_entry_point_offsets ); + + if (num_entry_point_offsets > 0) { + offset_len = get_uvlc(br); + if (offset_len == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + offset_len++; + + if (offset_len > 32) { + return 
DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + for (int i=0; i0) { + entry_point_offset[i] += entry_point_offset[i-1]; + } + } + } + } + else { + num_entry_point_offsets = 0; + } + + if (pps->slice_segment_header_extension_present_flag) { + slice_segment_header_extension_length = get_uvlc(br); + if (slice_segment_header_extension_length == UVLC_ERROR || + slice_segment_header_extension_length > 1000) { // TODO: safety check against too large values + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + for (int i=0; i DE265_MAX_PPS_SETS) { + errqueue->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false); + return DE265_OK; + } + out.write_uvlc(slice_pic_parameter_set_id); + + if (!first_slice_segment_in_pic_flag) { + if (pps->dependent_slice_segments_enabled_flag) { + out.write_bit(dependent_slice_segment_flag); + } + + out.write_bits(slice_segment_address, ceil_log2(sps->PicSizeInCtbsY)); + + if (dependent_slice_segment_flag) { + if (slice_segment_address == 0) { + errqueue->add_warning(DE265_WARNING_DEPENDENT_SLICE_WITH_ADDRESS_ZERO, false); + return DE265_OK; + } + } + } + + if (slice_segment_address < 0 || + slice_segment_address > sps->PicSizeInCtbsY) { + errqueue->add_warning(DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + + + if (!dependent_slice_segment_flag) { + for (int i=0; inum_extra_slice_header_bits; i++) { + //slice_reserved_undetermined_flag[i] + out.skip_bits(1); + } + + if (slice_type > 2) { + errqueue->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_OK; + } + out.write_uvlc(slice_type); + + if (pps->output_flag_present_flag) { + out.write_bit(pic_output_flag); + } + + if (sps->separate_colour_plane_flag == 1) { + out.write_bits(colour_plane_id,2); + } + + + int NumLtPics = 0; + + if (nal_unit_type != NAL_UNIT_IDR_W_RADL && + nal_unit_type != NAL_UNIT_IDR_N_LP) { + 
out.write_bits(slice_pic_order_cnt_lsb, sps->log2_max_pic_order_cnt_lsb); + out.write_bit(short_term_ref_pic_set_sps_flag); + + if (!short_term_ref_pic_set_sps_flag) { + /* TODO + read_short_term_ref_pic_set(ctx, sps, + br, &slice_ref_pic_set, + sps->num_short_term_ref_pic_sets, + sps->ref_pic_sets, + true); + */ + //CurrRpsIdx = sps->num_short_term_ref_pic_sets; + //CurrRps = slice_ref_pic_set; + } + else { + int nBits = ceil_log2(sps->num_short_term_ref_pic_sets()); + if (nBits>0) out.write_bits(short_term_ref_pic_set_idx,nBits); + else { assert(short_term_ref_pic_set_idx==0); } + + if (short_term_ref_pic_set_idx > sps->num_short_term_ref_pic_sets()) { + errqueue->add_warning(DE265_WARNING_SHORT_TERM_REF_PIC_SET_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + //CurrRpsIdx = short_term_ref_pic_set_idx; + //CurrRps = sps->ref_pic_sets[CurrRpsIdx]; + } + + + // --- long-term MC --- + + if (sps->long_term_ref_pics_present_flag) { + if (sps->num_long_term_ref_pics_sps > 0) { + out.write_uvlc(num_long_term_sps); + } + else { + assert(num_long_term_sps == 0); + } + + out.write_uvlc(num_long_term_pics); + + + // check maximum number of reference frames + + if (num_long_term_sps + + num_long_term_pics + + CurrRps.NumNegativePics + + CurrRps.NumPositivePics + > sps->sps_max_dec_pic_buffering[sps->sps_max_sub_layers-1]) + { + errqueue->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false); + return DE265_OK; + } + + for (int i=0; inum_long_term_ref_pics_sps); + out.write_bits(lt_idx_sps[i], nBits); + + // check that the referenced lt-reference really exists + + if (lt_idx_sps[i] >= sps->num_long_term_ref_pics_sps) { + errqueue->add_warning(DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER, false); + return DE265_OK; + } + + //ctx->PocLsbLt[i] = sps->lt_ref_pic_poc_lsb_sps[ lt_idx_sps[i] ]; + //ctx->UsedByCurrPicLt[i] = sps->used_by_curr_pic_lt_sps_flag[ lt_idx_sps[i] ]; + } + else { + int nBits = 
sps->log2_max_pic_order_cnt_lsb; + out.write_bits(poc_lsb_lt[i], nBits); + out.write_bit(used_by_curr_pic_lt_flag[i]); + + //ctx->PocLsbLt[i] = poc_lsb_lt[i]; + //ctx->UsedByCurrPicLt[i] = used_by_curr_pic_lt_flag[i]; + } + + //if (ctx->UsedByCurrPicLt[i]) { + //NumLtPics++; + //} + + out.write_bit(delta_poc_msb_present_flag[i]); + if (delta_poc_msb_present_flag[i]) { + out.write_uvlc(delta_poc_msb_cycle_lt[i]); + } + else { + assert(delta_poc_msb_cycle_lt[i] == 0); + } + + /* + if (i==0 || i==num_long_term_sps) { + ctx->DeltaPocMsbCycleLt[i] = delta_poc_msb_cycle_lt[i]; + } + else { + ctx->DeltaPocMsbCycleLt[i] = (delta_poc_msb_cycle_lt[i] + + ctx->DeltaPocMsbCycleLt[i-1]); + } + */ + } + } + else { + assert(num_long_term_sps == 0); + assert(num_long_term_pics== 0); + } + + if (sps->sps_temporal_mvp_enabled_flag) { + out.write_bit(slice_temporal_mvp_enabled_flag); + } + else { + assert(slice_temporal_mvp_enabled_flag == 0); + } + } + else { + assert(slice_pic_order_cnt_lsb == 0); + assert(num_long_term_sps == 0); + assert(num_long_term_pics== 0); + } + + + // --- SAO --- + + if (sps->sample_adaptive_offset_enabled_flag) { + out.write_bit(slice_sao_luma_flag); + out.write_bit(slice_sao_chroma_flag); + } + else { + assert(slice_sao_luma_flag == 0); + assert(slice_sao_chroma_flag== 0); + } + + if (slice_type == SLICE_TYPE_P || + slice_type == SLICE_TYPE_B) { + out.write_bit(num_ref_idx_active_override_flag); + + if (num_ref_idx_active_override_flag) { + out.write_uvlc(num_ref_idx_l0_active); + num_ref_idx_l0_active++;; + + if (slice_type == SLICE_TYPE_B) { + out.write_uvlc(num_ref_idx_l1_active); + num_ref_idx_l1_active++; + } + } + else { + assert(num_ref_idx_l0_active == pps->num_ref_idx_l0_default_active); + assert(num_ref_idx_l1_active == pps->num_ref_idx_l1_default_active); + } + + NumPocTotalCurr = CurrRps.NumPocTotalCurr_shortterm_only + NumLtPics; + + if (pps->lists_modification_present_flag && NumPocTotalCurr > 1) { + + int nBits = 
ceil_log2(NumPocTotalCurr); + + out.write_bit(ref_pic_list_modification_flag_l0); + if (ref_pic_list_modification_flag_l0) { + for (int i=0;icabac_init_present_flag) { + out.write_bit(cabac_init_flag); + } + else { + assert(cabac_init_flag == 0); + } + + if (slice_temporal_mvp_enabled_flag) { + if (slice_type == SLICE_TYPE_B) + out.write_bit(collocated_from_l0_flag); + else + { assert(collocated_from_l0_flag == 1); } + + if (( collocated_from_l0_flag && num_ref_idx_l0_active > 1) || + (!collocated_from_l0_flag && num_ref_idx_l1_active > 1)) { + out.write_uvlc(collocated_ref_idx); + } + else { + assert(collocated_ref_idx == 0); + } + } + + if ((pps->weighted_pred_flag && slice_type == SLICE_TYPE_P) || + (pps->weighted_bipred_flag && slice_type == SLICE_TYPE_B)) { + + assert(0); + /* TODO + if (!read_pred_weight_table(br,this,ctx)) + { + ctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + */ + } + + out.write_uvlc(five_minus_max_num_merge_cand); + //MaxNumMergeCand = 5-five_minus_max_num_merge_cand; + } + + out.write_svlc(slice_qp_delta); + + if (pps->pps_slice_chroma_qp_offsets_present_flag) { + out.write_svlc(slice_cb_qp_offset); + out.write_svlc(slice_cr_qp_offset); + } + else { + assert(slice_cb_qp_offset == 0); + assert(slice_cr_qp_offset == 0); + } + + if (pps->deblocking_filter_override_enabled_flag) { + out.write_bit(deblocking_filter_override_flag); + } + else { + assert(deblocking_filter_override_flag == 0); + } + + //slice_beta_offset = pps->beta_offset; + //slice_tc_offset = pps->tc_offset; + + if (deblocking_filter_override_flag) { + out.write_bit(slice_deblocking_filter_disabled_flag); + if (!slice_deblocking_filter_disabled_flag) { + out.write_svlc(slice_beta_offset/2); + out.write_svlc(slice_tc_offset /2); + } + } + else { + assert(slice_deblocking_filter_disabled_flag == pps->pic_disable_deblocking_filter_flag); + } + + if (pps->pps_loop_filter_across_slices_enabled_flag && + 
(slice_sao_luma_flag || slice_sao_chroma_flag || + !slice_deblocking_filter_disabled_flag )) { + out.write_bit(slice_loop_filter_across_slices_enabled_flag); + } + else { + assert(slice_loop_filter_across_slices_enabled_flag == + pps->pps_loop_filter_across_slices_enabled_flag); + } + } + + if (pps->tiles_enabled_flag || pps->entropy_coding_sync_enabled_flag ) { + out.write_uvlc(num_entry_point_offsets); + + if (num_entry_point_offsets > 0) { + out.write_uvlc(offset_len-1); + + for (int i=0; i0) prev = entry_point_offset[i-1]; + out.write_bits(entry_point_offset[i]-prev-1, offset_len); + } + } + } + } + else { + assert(num_entry_point_offsets == 0); + } + + if (pps->slice_segment_header_extension_present_flag) { + out.write_uvlc(slice_segment_header_extension_length); + if (slice_segment_header_extension_length > 1000) { // TODO: safety check against too large values + errqueue->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + for (int i=0; ipic_init_qp + slice_qp_delta; + + switch (slice_type) + { + case SLICE_TYPE_I: initType = 0; break; + case SLICE_TYPE_P: initType = cabac_init_flag + 1; break; + case SLICE_TYPE_B: initType = 2 - cabac_init_flag; break; + } + + MaxNumMergeCand = 5-five_minus_max_num_merge_cand; +} + + +//----------------------------------------------------------------------- + + +void slice_segment_header::dump_slice_segment_header(const decoder_context* ctx, int fd) const +{ + FILE* fh; + if (fd==1) fh=stdout; + else if (fd==2) fh=stderr; + else { return; } + +#define LOG0(t) log2fh(fh, t) +#define LOG1(t,d) log2fh(fh, t,d) +#define LOG2(t,d1,d2) log2fh(fh, t,d1,d2) +#define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3) +#define LOG4(t,d1,d2,d3,d4) log2fh(fh, t,d1,d2,d3,d4) + + const pic_parameter_set* pps = ctx->get_pps(slice_pic_parameter_set_id); + assert(pps->pps_read); // TODO: error handling + + const seq_parameter_set* sps = ctx->get_sps((int)pps->seq_parameter_set_id); + 
assert(sps->sps_read); // TODO: error handling + + + LOG0("----------------- SLICE -----------------\n"); + LOG1("first_slice_segment_in_pic_flag : %d\n", first_slice_segment_in_pic_flag); + if (ctx->get_nal_unit_type() >= NAL_UNIT_BLA_W_LP && + ctx->get_nal_unit_type() <= NAL_UNIT_RESERVED_IRAP_VCL23) { + LOG1("no_output_of_prior_pics_flag : %d\n", no_output_of_prior_pics_flag); + } + + LOG1("slice_pic_parameter_set_id : %d\n", slice_pic_parameter_set_id); + + if (!first_slice_segment_in_pic_flag) { + //if (pps->dependent_slice_segments_enabled_flag) { + LOG1("dependent_slice_segment_flag : %d\n", dependent_slice_segment_flag); + //} + LOG1("slice_segment_address : %d\n", slice_segment_address); + } + + //if (!dependent_slice_segment_flag) + { + //for (int i=0; inum_extra_slice_header_bits; i++) { + //slice_reserved_flag[i] + + LOG1("slice_type : %c\n", + slice_type == 0 ? 'B' : + slice_type == 1 ? 'P' : 'I'); + + if (pps->output_flag_present_flag) { + LOG1("pic_output_flag : %d\n", pic_output_flag); + } + + if (sps->separate_colour_plane_flag == 1) { + LOG1("colour_plane_id : %d\n", colour_plane_id); + } + + LOG1("slice_pic_order_cnt_lsb : %d\n", slice_pic_order_cnt_lsb); + + if (ctx->get_nal_unit_type() != NAL_UNIT_IDR_W_RADL && + ctx->get_nal_unit_type() != NAL_UNIT_IDR_N_LP) { + LOG1("short_term_ref_pic_set_sps_flag : %d\n", short_term_ref_pic_set_sps_flag); + + if (!short_term_ref_pic_set_sps_flag) { + LOG1("ref_pic_set[ %2d ]: ",sps->num_short_term_ref_pic_sets()); + dump_compact_short_term_ref_pic_set(&slice_ref_pic_set, 16, fh); + } + else if (sps->num_short_term_ref_pic_sets() > 1) { + LOG1("short_term_ref_pic_set_idx : %d\n", short_term_ref_pic_set_idx); + dump_compact_short_term_ref_pic_set(&sps->ref_pic_sets[short_term_ref_pic_set_idx], 16, fh); + } + + if (sps->long_term_ref_pics_present_flag) { + if (sps->num_long_term_ref_pics_sps > 0) { + LOG1("num_long_term_sps : %d\n", num_long_term_sps); + } + + LOG1("num_long_term_pics : %d\n", 
num_long_term_pics); + +#if 0 + for (int i=0; iPocLsbLt[i]); + LOG2("UsedByCurrPicLt[%d] : %d\n", i, ctx->UsedByCurrPicLt[i]); + LOG2("DeltaPocMsbCycleLt[%d] : %d\n", i, ctx->DeltaPocMsbCycleLt[i]); + } +#endif + } + + if (sps->sps_temporal_mvp_enabled_flag) { + LOG1("slice_temporal_mvp_enabled_flag : %d\n", slice_temporal_mvp_enabled_flag); + } + } + + + if (sps->sample_adaptive_offset_enabled_flag) { + LOG1("slice_sao_luma_flag : %d\n", slice_sao_luma_flag); + LOG1("slice_sao_chroma_flag : %d\n", slice_sao_chroma_flag); + } + + + if (slice_type == SLICE_TYPE_P || slice_type == SLICE_TYPE_B) { + LOG1("num_ref_idx_active_override_flag : %d\n", num_ref_idx_active_override_flag); + + LOG2("num_ref_idx_l0_active : %d %s\n", num_ref_idx_l0_active, + num_ref_idx_active_override_flag ? "" : "(from PPS)"); + + if (slice_type == SLICE_TYPE_B) { + LOG2("num_ref_idx_l1_active : %d %s\n", num_ref_idx_l1_active, + num_ref_idx_active_override_flag ? "" : "(from PPS)"); + } + + if (pps->lists_modification_present_flag && NumPocTotalCurr > 1) + { + LOG1("ref_pic_list_modification_flag_l0 : %d\n", ref_pic_list_modification_flag_l0); + if (ref_pic_list_modification_flag_l0) { + for (int i=0;iweighted_pred_flag && slice_type == SLICE_TYPE_P) || + (pps->weighted_bipred_flag && slice_type == SLICE_TYPE_B)) + { + LOG1("luma_log2_weight_denom : %d\n", luma_log2_weight_denom); + if (sps->chroma_format_idc != 0) { + LOG1("ChromaLog2WeightDenom : %d\n", ChromaLog2WeightDenom); + } + + for (int l=0;l<=1;l++) + if (l==0 || (l==1 && slice_type == SLICE_TYPE_B)) + { + int num_ref = (l==0 ? 
+ num_ref_idx_l0_active-1 : + num_ref_idx_l1_active-1); + + if (false) { // do not show these flags + for (int i=0;i<=num_ref;i++) { + LOG3("luma_weight_flag_l%d[%d] : %d\n",l,i,luma_weight_flag[l][i]); + } + + if (sps->chroma_format_idc != 0) { + for (int i=0;i<=num_ref;i++) { + LOG3("chroma_weight_flag_l%d[%d] : %d\n",l,i,chroma_weight_flag[l][i]); + } + } + } + + for (int i=0;i<=num_ref;i++) { + LOG3("LumaWeight_L%d[%d] : %d\n",l,i,LumaWeight[l][i]); + LOG3("luma_offset_l%d[%d] : %d\n",l,i,luma_offset[l][i]); + + for (int j=0;j<2;j++) { + LOG4("ChromaWeight_L%d[%d][%d] : %d\n",l,i,j,ChromaWeight[l][i][j]); + LOG4("ChromaOffset_L%d[%d][%d] : %d\n",l,i,j,ChromaOffset[l][i][j]); + } + } + } + } + + LOG1("five_minus_max_num_merge_cand : %d\n", five_minus_max_num_merge_cand); + } + + + LOG1("slice_qp_delta : %d\n", slice_qp_delta); + if (pps->pps_slice_chroma_qp_offsets_present_flag) { + LOG1("slice_cb_qp_offset : %d\n", slice_cb_qp_offset); + LOG1("slice_cr_qp_offset : %d\n", slice_cr_qp_offset); + } + + if (pps->deblocking_filter_override_enabled_flag) { + LOG1("deblocking_filter_override_flag : %d\n", deblocking_filter_override_flag); + } + + LOG2("slice_deblocking_filter_disabled_flag : %d %s\n", + slice_deblocking_filter_disabled_flag, + (deblocking_filter_override_flag ? 
"(override)" : "(from pps)")); + + if (deblocking_filter_override_flag) { + + if (!slice_deblocking_filter_disabled_flag) { + LOG1("slice_beta_offset : %d\n", slice_beta_offset); + LOG1("slice_tc_offset : %d\n", slice_tc_offset); + } + } + + if (pps->pps_loop_filter_across_slices_enabled_flag && + (slice_sao_luma_flag || slice_sao_chroma_flag || + !slice_deblocking_filter_disabled_flag)) { + LOG1("slice_loop_filter_across_slices_enabled_flag : %d\n", + slice_loop_filter_across_slices_enabled_flag); + } + } + + if (pps->tiles_enabled_flag || pps->entropy_coding_sync_enabled_flag) { + LOG1("num_entry_point_offsets : %d\n", num_entry_point_offsets); + + if (num_entry_point_offsets > 0) { + LOG1("offset_len : %d\n", offset_len); + + for (int i=0; ishdr->SliceQPY; + const int initType = tctx->shdr->initType; + assert(initType >= 0 && initType <= 2); + + tctx->ctx_model.init(initType, QPY); + + for (int i=0;i<4;i++) { + tctx->StatCoeff[i] = 0; + } +} + + + +static int decode_transform_skip_flag(thread_context* tctx, int cIdx) +{ + const int context = (cIdx==0) ? 
0 : 1; + + logtrace(LogSlice,"# transform_skip_flag (context=%d)\n",context); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_TRANSFORM_SKIP_FLAG+context]); + + logtrace(LogSymbols,"$1 transform_skip_flag=%d\n",bit); + + return bit; +} + + +static int decode_sao_merge_flag(thread_context* tctx) +{ + logtrace(LogSlice,"# sao_merge_left/up_flag\n"); + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_SAO_MERGE_FLAG]); + + logtrace(LogSymbols,"$1 sao_merge_flag=%d\n",bit); + + return bit; +} + + + +static int decode_sao_type_idx(thread_context* tctx) +{ + logtrace(LogSlice,"# sao_type_idx_luma/chroma\n"); + + int bit0 = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_SAO_TYPE_IDX]); + + if (bit0==0) { + logtrace(LogSymbols,"$1 sao_type_idx=%d\n",0); + return 0; + } + else { + int bit1 = decode_CABAC_bypass(&tctx->cabac_decoder); + if (bit1==0) { + logtrace(LogSymbols,"$1 sao_type_idx=%d\n",1); + return 1; + } + else { + logtrace(LogSymbols,"$1 sao_type_idx=%d\n",2); + return 2; + } + } +} + + +static int decode_sao_offset_abs(thread_context* tctx, int bitDepth) +{ + logtrace(LogSlice,"# sao_offset_abs\n"); + int cMax = (1<<(libde265_min(bitDepth,10)-5))-1; + int value = decode_CABAC_TU_bypass(&tctx->cabac_decoder, cMax); + logtrace(LogSymbols,"$1 sao_offset_abs=%d\n",value); + return value; +} + + +static int decode_sao_class(thread_context* tctx) +{ + logtrace(LogSlice,"# sao_class\n"); + int value = decode_CABAC_FL_bypass(&tctx->cabac_decoder, 2); + logtrace(LogSymbols,"$1 sao_class=%d\n",value); + return value; +} + + +static int decode_sao_offset_sign(thread_context* tctx) +{ + logtrace(LogSlice,"# sao_offset_sign\n"); + int value = decode_CABAC_bypass(&tctx->cabac_decoder); + logtrace(LogSymbols,"$1 sao_offset_sign=%d\n",value); + return value; +} + + +static int decode_sao_band_position(thread_context* tctx) +{ + logtrace(LogSlice,"# sao_band_position\n"); + int value = 
decode_CABAC_FL_bypass(&tctx->cabac_decoder,5); + logtrace(LogSymbols,"$1 sao_band_position=%d\n",value); + return value; +} + + +static int decode_transquant_bypass_flag(thread_context* tctx) +{ + logtrace(LogSlice,"# cu_transquant_bypass_enable_flag\n"); + int value = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG]); + logtrace(LogSymbols,"$1 transquant_bypass_flag=%d\n",value); + return value; +} + + +#include +#include + +static int decode_split_cu_flag(thread_context* tctx, + int x0, int y0, int ctDepth) +{ + // check if neighbors are available + + int availableL = check_CTB_available(tctx->img, x0,y0, x0-1,y0); + int availableA = check_CTB_available(tctx->img, x0,y0, x0,y0-1); + + int condL = 0; + int condA = 0; + + if (availableL && tctx->img->get_ctDepth(x0-1,y0) > ctDepth) condL=1; + if (availableA && tctx->img->get_ctDepth(x0,y0-1) > ctDepth) condA=1; + + int contextOffset = condL + condA; + int context = contextOffset; + + // decode bit + + logtrace(LogSlice,"# split_cu_flag context=%d R=%x\n", context, tctx->cabac_decoder.range); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_SPLIT_CU_FLAG + context]); + + logtrace(LogSlice,"> split_cu_flag R=%x, ctx=%d, bit=%d\n", tctx->cabac_decoder.range,context,bit); + + logtrace(LogSymbols,"$1 split_cu_flag=%d\n",bit); + + return bit; +} + + +static int decode_cu_skip_flag(thread_context* tctx, + int x0, int y0, int ctDepth) +{ + decoder_context* ctx = tctx->decctx; + + // check if neighbors are available + + int availableL = check_CTB_available(tctx->img, x0,y0, x0-1,y0); + int availableA = check_CTB_available(tctx->img, x0,y0, x0,y0-1); + + int condL = 0; + int condA = 0; + + if (availableL && tctx->img->get_cu_skip_flag(x0-1,y0)) condL=1; + if (availableA && tctx->img->get_cu_skip_flag(x0,y0-1)) condA=1; + + int contextOffset = condL + condA; + int context = contextOffset; + + // decode bit + + logtrace(LogSlice,"# 
cu_skip_flag context=%d R=%x\n", context, tctx->cabac_decoder.range); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_CU_SKIP_FLAG + context]); + + logtrace(LogSlice,"> cu_skip_flag R=%x, ctx=%d, bit=%d\n", tctx->cabac_decoder.range,context,bit); + + logtrace(LogSymbols,"$1 cu_skip_flag=%d\n",bit); + + return bit; +} + + +static enum PartMode decode_part_mode(thread_context* tctx, + enum PredMode pred_mode, int cLog2CbSize) +{ + de265_image* img = tctx->img; + + if (pred_mode == MODE_INTRA) { + logtrace(LogSlice,"# part_mode (INTRA)\n"); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE]); + + logtrace(LogSlice,"> %s\n",bit ? "2Nx2N" : "NxN"); + + logtrace(LogSymbols,"$1 part_mode=%d\n",bit ? PART_2Nx2N : PART_NxN); + + return bit ? PART_2Nx2N : PART_NxN; + } + else { + const seq_parameter_set& sps = img->get_sps(); + + int bit0 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+0]); + if (bit0) { logtrace(LogSymbols,"$1 part_mode=%d\n",PART_2Nx2N); return PART_2Nx2N; } + + // CHECK_ME: I optimize code and fix bug here, need more VERIFY! + int bit1 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+1]); + if (cLog2CbSize > sps.Log2MinCbSizeY) { + if (!sps.amp_enabled_flag) { + logtrace(LogSymbols,"$1 part_mode=%d\n",bit1 ? PART_2NxN : PART_Nx2N); + return bit1 ? PART_2NxN : PART_Nx2N; + } + else { + int bit3 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+3]); + if (bit3) { + logtrace(LogSymbols,"$1 part_mode=%d\n",bit1 ? PART_2NxN : PART_Nx2N); + return bit1 ? 
PART_2NxN : PART_Nx2N; + } + + int bit4 = decode_CABAC_bypass(&tctx->cabac_decoder); + if ( bit1 && bit4) { + logtrace(LogSymbols,"$1 part_mode=%d\n",PART_2NxnD); + return PART_2NxnD; + } + if ( bit1 && !bit4) { + logtrace(LogSymbols,"$1 part_mode=%d\n",PART_2NxnU); + return PART_2NxnU; + } + if (!bit1 && !bit4) { + logtrace(LogSymbols,"$1 part_mode=%d\n",PART_nLx2N); + return PART_nLx2N; + } + if (!bit1 && bit4) { + logtrace(LogSymbols,"$1 part_mode=%d\n",PART_nRx2N); + return PART_nRx2N; + } + } + } + else { + // TODO, we could save one if here when first decoding the next bin and then + // checkcLog2CbSize==3 when it is '0' + + if (bit1) { + logtrace(LogSymbols,"$1 part_mode=%d\n",PART_2NxN); + return PART_2NxN; + } + + if (cLog2CbSize==3) { + logtrace(LogSymbols,"$1 part_mode=%d\n",PART_Nx2N); + return PART_Nx2N; + } + else { + int bit2 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+2]); + logtrace(LogSymbols,"$1 part_mode=%d\n",PART_NxN-bit2); + return (enum PartMode)((int)PART_NxN - bit2)/*bit2 ? 
PART_Nx2N : PART_NxN*/; + } + } + } + + assert(false); // should never be reached + return PART_2Nx2N; +} + + +static inline int decode_prev_intra_luma_pred_flag(thread_context* tctx) +{ + logtrace(LogSlice,"# prev_intra_luma_pred_flag\n"); + int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG]); + logtrace(LogSymbols,"$1 prev_intra_luma_pred_flag=%d\n",bit); + return bit; +} + + +static inline int decode_mpm_idx(thread_context* tctx) +{ + logtrace(LogSlice,"# mpm_idx (TU:2)\n"); + int mpm = decode_CABAC_TU_bypass(&tctx->cabac_decoder, 2); + logtrace(LogSlice,"> mpm_idx = %d\n",mpm); + logtrace(LogSymbols,"$1 mpm_idx=%d\n",mpm); + return mpm; +} + + +static inline int decode_rem_intra_luma_pred_mode(thread_context* tctx) +{ + logtrace(LogSlice,"# rem_intra_luma_pred_mode (5 bits)\n"); + int value = decode_CABAC_FL_bypass(&tctx->cabac_decoder, 5); + logtrace(LogSymbols,"$1 rem_intra_luma_pred_mode=%d\n",value); + return value; +} + + +static int decode_intra_chroma_pred_mode(thread_context* tctx) +{ + logtrace(LogSlice,"# intra_chroma_pred_mode\n"); + + int prefix = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE]); + + int mode; + if (prefix==0) { + mode=4; + } + else { + mode = decode_CABAC_FL_bypass(&tctx->cabac_decoder, 2); + } + + logtrace(LogSlice,"> intra_chroma_pred_mode = %d\n",mode); + logtrace(LogSymbols,"$1 intra_chroma_pred_mode=%d\n",mode); + + return mode; +} + + +static int decode_split_transform_flag(thread_context* tctx, + int log2TrafoSize) +{ + logtrace(LogSlice,"# split_transform_flag (log2TrafoSize=%d)\n",log2TrafoSize); + + int context = 5-log2TrafoSize; + assert(context >= 0 && context <= 2); + + logtrace(LogSlice,"# context: %d\n",context); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG + context]); + logtrace(LogSymbols,"$1 split_transform_flag=%d\n",bit); + return bit; +} + + +static 
int decode_cbf_chroma(thread_context* tctx, + int trafoDepth) +{ + logtrace(LogSlice,"# cbf_chroma\n"); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_CBF_CHROMA + trafoDepth]); + + logtrace(LogSymbols,"$1 cbf_chroma=%d\n",bit); + return bit; +} + + +static int decode_cbf_luma(thread_context* tctx, + int trafoDepth) +{ + logtrace(LogSlice,"# cbf_luma\n"); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_CBF_LUMA + (trafoDepth==0)]); + + logtrace(LogSlice,"> cbf_luma = %d\n",bit); + + logtrace(LogSymbols,"$1 cbf_luma=%d\n",bit); + return bit; +} + + +static inline int decode_coded_sub_block_flag(thread_context* tctx, + int cIdx, + uint8_t coded_sub_block_neighbors) +{ + logtrace(LogSlice,"# coded_sub_block_flag\n"); + + // tricky computation of csbfCtx + int csbfCtx = ((coded_sub_block_neighbors & 1) | // right neighbor set or + (coded_sub_block_neighbors >> 1)); // bottom neighbor set -> csbfCtx=1 + + int ctxIdxInc = csbfCtx; + if (cIdx!=0) { + ctxIdxInc += 2; + } + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG + ctxIdxInc]); + + logtrace(LogSymbols,"$1 coded_sub_block_flag=%d\n",bit); + return bit; +} + + +static int decode_cu_qp_delta_abs(thread_context* tctx) +{ + logtrace(LogSlice,"# cu_qp_delta_abs\n"); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_CU_QP_DELTA_ABS + 0]); + if (bit==0) { + logtrace(LogSymbols,"$1 cu_qp_delta_abs=%d\n",0); + return 0; + } + + int prefix=1; + for (int i=0;i<4;i++) { + bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_CU_QP_DELTA_ABS + 1]); + if (bit==0) { break; } + else { prefix++; } + } + + if (prefix==5) { + int value = decode_CABAC_EGk_bypass(&tctx->cabac_decoder, 0); + logtrace(LogSymbols,"$1 cu_qp_delta_abs=%d\n",value+5); + return value + 5; + } + else { + logtrace(LogSymbols,"$1 cu_qp_delta_abs=%d\n",prefix); + return prefix; 
+ } +} + + +static int decode_last_significant_coeff_prefix(thread_context* tctx, + int log2TrafoSize, + int cIdx, + context_model* model) +{ + logtrace(LogSlice,"# last_significant_coeff_prefix log2TrafoSize:%d cIdx:%d\n",log2TrafoSize,cIdx); + + int cMax = (log2TrafoSize<<1)-1; + + int ctxOffset, ctxShift; + if (cIdx==0) { + ctxOffset = 3*(log2TrafoSize-2) + ((log2TrafoSize-1)>>2); + ctxShift = (log2TrafoSize+1)>>2; + } + else { + ctxOffset = 15; + ctxShift = log2TrafoSize-2; + } + + int binIdx; + int value = cMax; + for (binIdx=0;binIdx> ctxShift); + + logtrace(LogSlice,"context: %d+%d\n",ctxOffset,ctxIdxInc); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, &model[ctxOffset + ctxIdxInc]); + if (bit==0) { + value=binIdx; + break; + } + } + + logtrace(LogSlice,"> last_significant_coeff_prefix: %d\n", value); + + return value; +} + + +static const uint8_t ctxIdxMap[16] = { + 0,1,4,5, + 2,3,4,5, + 6,6,8,8, + 7,7,8,99 +}; + +uint8_t* ctxIdxLookup[4 /* 4-log2-32 */][2 /* !!cIdx */][2 /* !!scanIdx */][4 /* prevCsbf */]; + +bool alloc_and_init_significant_coeff_ctxIdx_lookupTable() +{ + int tableSize = 4*4*(2) + 8*8*(2*2*4) + 16*16*(2*4) + 32*32*(2*4); + + uint8_t* p = (uint8_t*)malloc(tableSize); + if (p==NULL) { + return false; + } + + memset(p,0xFF,tableSize); // just for debugging + + + // --- Set pointers to memory areas. Note that some parameters share the same memory. 
--- + + // 4x4 + + for (int cIdx=0;cIdx<2;cIdx++) { + for (int scanIdx=0;scanIdx<2;scanIdx++) + for (int prevCsbf=0;prevCsbf<4;prevCsbf++) + ctxIdxLookup[0][cIdx][scanIdx][prevCsbf] = p; + + p += 4*4; + } + + // 8x8 + + for (int cIdx=0;cIdx<2;cIdx++) + for (int scanIdx=0;scanIdx<2;scanIdx++) + for (int prevCsbf=0;prevCsbf<4;prevCsbf++) { + ctxIdxLookup[1][cIdx][scanIdx][prevCsbf] = p; + p += 8*8; + } + + // 16x16 + + for (int cIdx=0;cIdx<2;cIdx++) + for (int prevCsbf=0;prevCsbf<4;prevCsbf++) { + for (int scanIdx=0;scanIdx<2;scanIdx++) { + ctxIdxLookup[2][cIdx][scanIdx][prevCsbf] = p; + } + + p += 16*16; + } + + // 32x32 + + for (int cIdx=0;cIdx<2;cIdx++) + for (int prevCsbf=0;prevCsbf<4;prevCsbf++) { + for (int scanIdx=0;scanIdx<2;scanIdx++) { + ctxIdxLookup[3][cIdx][scanIdx][prevCsbf] = p; + } + + p += 32*32; + } + + + // --- precompute ctxIdx tables --- + + for (int log2w=2; log2w<=5 ; log2w++) + for (int cIdx=0;cIdx<2;cIdx++) + for (int scanIdx=0;scanIdx<2;scanIdx++) + for (int prevCsbf=0;prevCsbf<4;prevCsbf++) + { + for (int yC=0;yC<(1<>2; + + int sigCtx; + + // if log2TrafoSize==2 + if (sbWidth==1) { + sigCtx = ctxIdxMap[(yC<<2) + xC]; + } + else if (xC+yC==0) { + sigCtx = 0; + } + else { + int xS = xC>>2; + int yS = yC>>2; + /* + int prevCsbf = 0; + + if (xS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+1 +yS*sbWidth]; } + if (yS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+(1+yS)*sbWidth]<<1; } + */ + int xP = xC & 3; + int yP = yC & 3; + + //logtrace(LogSlice,"posInSubset: %d,%d\n",xP,yP); + //logtrace(LogSlice,"prevCsbf: %d\n",prevCsbf); + + switch (prevCsbf) { + case 0: + sigCtx = (xP+yP>=3) ? 0 : (xP+yP>0) ? 1 : 2; + break; + case 1: + sigCtx = (yP==0) ? 2 : (yP==1) ? 1 : 0; + break; + case 2: + sigCtx = (xP==0) ? 2 : (xP==1) ? 
1 : 0; + break; + default: + sigCtx = 2; + break; + } + + //logtrace(LogSlice,"a) sigCtx=%d\n",sigCtx); + + if (cIdx==0) { + if (xS+yS > 0) sigCtx+=3; + + //logtrace(LogSlice,"b) sigCtx=%d\n",sigCtx); + + // if log2TrafoSize==3 + if (sbWidth==2) { // 8x8 block + sigCtx += (scanIdx==0) ? 9 : 15; + } else { + sigCtx += 21; + } + + //logtrace(LogSlice,"c) sigCtx=%d\n",sigCtx); + } + else { + // if log2TrafoSize==3 + if (sbWidth==2) { // 8x8 block + sigCtx+=9; + } + else { + sigCtx+=12; + } + } + + } + + int ctxIdxInc; + if (cIdx==0) { ctxIdxInc=sigCtx; } + else { ctxIdxInc=27+sigCtx; } + + if (ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf][xC+(yC<>4]; + int x0 = S.x<<2; + int y0 = S.y<<2; + + int subX = ScanOrderPos[s & 0xF].x; + int subY = ScanOrderPos[s & 0xF].y; + int xC = x0 + subX; + int yC = y0 + subY; + + + int w = 1<>2; + + int sigCtx; + + // if log2TrafoSize==2 + if (sbWidth==1) { + sigCtx = ctxIdxMap[(yC<<2) + xC]; + } + else if (xC+yC==0) { + sigCtx = 0; + } + else { + int xS = xC>>2; + int yS = yC>>2; + /* + int prevCsbf = 0; + + if (xS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+1 +yS*sbWidth]; } + if (yS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+(1+yS)*sbWidth]<<1; } + */ + int xP = xC & 3; + int yP = yC & 3; + + logtrace(LogSlice,"posInSubset: %d,%d\n",xP,yP); + logtrace(LogSlice,"prevCsbf: %d\n",prevCsbf); + + //printf("%d | %d %d\n",prevCsbf,xP,yP); + + switch (prevCsbf) { + case 0: + //sigCtx = (xP+yP==0) ? 2 : (xP+yP<3) ? 1 : 0; + sigCtx = (xP+yP>=3) ? 0 : (xP+yP>0) ? 1 : 2; + break; + case 1: + sigCtx = (yP==0) ? 2 : (yP==1) ? 1 : 0; + break; + case 2: + sigCtx = (xP==0) ? 2 : (xP==1) ? 1 : 0; + break; + default: + sigCtx = 2; + break; + } + + logtrace(LogSlice,"a) sigCtx=%d\n",sigCtx); + + if (cIdx==0) { + if (xS+yS > 0) sigCtx+=3; + + logtrace(LogSlice,"b) sigCtx=%d\n",sigCtx); + + // if log2TrafoSize==3 + if (sbWidth==2) { // 8x8 block + sigCtx += (scanIdx==0) ? 
9 : 15; + } else { + sigCtx += 21; + } + + logtrace(LogSlice,"c) sigCtx=%d\n",sigCtx); + } + else { + // if log2TrafoSize==3 + if (sbWidth==2) { // 8x8 block + sigCtx+=9; + } + else { + sigCtx+=12; + } + } + } + + int ctxIdxInc; + if (cIdx==0) { ctxIdxInc=sigCtx; } + else { ctxIdxInc=27+sigCtx; } + + + ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf][xC+(yC<>2; + int yS = yC>>2; + int prevCsbf = 0; + if (xS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+1 +yS*sbWidth]; } + if (yS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+(1+yS)*sbWidth]<<1; } + + int xP = xC & 3; + int yP = yC & 3; + + logtrace(LogSlice,"posInSubset: %d,%d\n",xP,yP); + logtrace(LogSlice,"prevCsbf: %d\n",prevCsbf); + + //printf("%d | %d %d\n",prevCsbf,xP,yP); + + switch (prevCsbf) { + case 0: + //sigCtx = (xP+yP==0) ? 2 : (xP+yP<3) ? 1 : 0; + sigCtx = (xP+yP>=3) ? 0 : (xP+yP>0) ? 1 : 2; + break; + case 1: + sigCtx = (yP==0) ? 2 : (yP==1) ? 1 : 0; + break; + case 2: + sigCtx = (xP==0) ? 2 : (xP==1) ? 1 : 0; + break; + default: + sigCtx = 2; + break; + } + + logtrace(LogSlice,"a) sigCtx=%d\n",sigCtx); + + if (cIdx==0) { + if (xS+yS > 0) sigCtx+=3; + + logtrace(LogSlice,"b) sigCtx=%d\n",sigCtx); + + // if log2TrafoSize==3 + if (sbWidth==2) { + sigCtx += (scanIdx==0) ? 
9 : 15; + } else { + sigCtx += 21; + } + + logtrace(LogSlice,"c) sigCtx=%d\n",sigCtx); + } + else { + // if log2TrafoSize==3 + if (sbWidth==2) { + sigCtx+=9; + } + else { + sigCtx+=12; + } + } + } + + int ctxIdxInc; + if (cIdx==0) { ctxIdxInc=sigCtx; } + else { ctxIdxInc=27+sigCtx; } + + int context = tctx->shdr->initType*42 + ctxIdxInc; + logtrace(LogSlice,"context: %d\n",context); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG + context]); + return bit; +} +#endif + + + +static inline int decode_significant_coeff_flag_lookup(thread_context* tctx, + uint8_t ctxIdxInc) +{ + logtrace(LogSlice,"# significant_coeff_flag\n"); + logtrace(LogSlice,"context: %d\n",ctxIdxInc); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG + ctxIdxInc]); + + logtrace(LogSymbols,"$1 significant_coeff_flag=%d\n",bit); + + return bit; +} + + + + + +static inline int decode_coeff_abs_level_greater1(thread_context* tctx, + int cIdx, int i, + bool firstCoeffInSubblock, + bool firstSubblock, + int lastSubblock_greater1Ctx, + int* lastInvocation_greater1Ctx, + int* lastInvocation_coeff_abs_level_greater1_flag, + int* lastInvocation_ctxSet, int c1) +{ + logtrace(LogSlice,"# coeff_abs_level_greater1\n"); + + logtrace(LogSlice," cIdx:%d i:%d firstCoeffInSB:%d firstSB:%d lastSB>1:%d last>1Ctx:%d lastLev>1:%d lastCtxSet:%d\n", cIdx,i,firstCoeffInSubblock,firstSubblock,lastSubblock_greater1Ctx, + *lastInvocation_greater1Ctx, + *lastInvocation_coeff_abs_level_greater1_flag, + *lastInvocation_ctxSet); + + int lastGreater1Ctx; + int greater1Ctx; + int ctxSet; + + logtrace(LogSlice,"c1: %d\n",c1); + + if (firstCoeffInSubblock) { + // block with real DC -> ctx 0 + if (i==0 || cIdx>0) { ctxSet=0; } + else { ctxSet=2; } + + if (firstSubblock) { lastGreater1Ctx=1; } + else { lastGreater1Ctx = lastSubblock_greater1Ctx; } + + if (lastGreater1Ctx==0) { ctxSet++; } + + 
logtrace(LogSlice,"ctxSet: %d\n",ctxSet); + + greater1Ctx=1; + } + else { // !firstCoeffInSubblock + ctxSet = *lastInvocation_ctxSet; + logtrace(LogSlice,"ctxSet (old): %d\n",ctxSet); + + greater1Ctx = *lastInvocation_greater1Ctx; + if (greater1Ctx>0) { + int lastGreater1Flag=*lastInvocation_coeff_abs_level_greater1_flag; + if (lastGreater1Flag==1) greater1Ctx=0; + else { /*if (greater1Ctx>0)*/ greater1Ctx++; } + } + } + + ctxSet = c1; // use HM algo + + int ctxIdxInc = (ctxSet*4) + (greater1Ctx>=3 ? 3 : greater1Ctx); + + if (cIdx>0) { ctxIdxInc+=16; } + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG + ctxIdxInc]); + + *lastInvocation_greater1Ctx = greater1Ctx; + *lastInvocation_coeff_abs_level_greater1_flag = bit; + *lastInvocation_ctxSet = ctxSet; + + //logtrace(LogSymbols,"$1 coeff_abs_level_greater1=%d\n",bit); + + return bit; +} + + +static int decode_coeff_abs_level_greater2(thread_context* tctx, + int cIdx, // int i,int n, + int ctxSet) +{ + logtrace(LogSlice,"# coeff_abs_level_greater2\n"); + + int ctxIdxInc = ctxSet; + + if (cIdx>0) ctxIdxInc+=4; + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG + ctxIdxInc]); + + logtrace(LogSymbols,"$1 coeff_abs_level_greater2=%d\n",bit); + + return bit; +} + + +#define MAX_PREFIX 64 + +static int decode_coeff_abs_level_remaining(thread_context* tctx, + int cRiceParam) +{ + logtrace(LogSlice,"# decode_coeff_abs_level_remaining\n"); + + int prefix=-1; + int codeword=0; + do { + prefix++; + codeword = decode_CABAC_bypass(&tctx->cabac_decoder); + + if (prefix>MAX_PREFIX) { + return 0; // TODO: error + } + } + while (codeword); + + // prefix = nb. 
1 bits + + int value; + + if (prefix <= 3) { + // when code only TR part (level < TRMax) + + codeword = decode_CABAC_FL_bypass(&tctx->cabac_decoder, cRiceParam); + value = (prefix<cabac_decoder, prefix-3+cRiceParam); + value = (((1<<(prefix-3))+3-1)<cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_MERGE_FLAG]); + + logtrace(LogSymbols,"$1 merge_flag=%d\n",bit); + + return bit; +} + + +static int decode_merge_idx(thread_context* tctx) +{ + logtrace(LogSlice,"# merge_idx\n"); + + if (tctx->shdr->MaxNumMergeCand <= 1) { + logtrace(LogSymbols,"$1 merge_idx=%d\n",0); + return 0; + } + + // TU coding, first bin is CABAC, remaining are bypass. + // cMax = MaxNumMergeCand-1 + + int idx = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_MERGE_IDX]); + + if (idx==0) { + // nothing + } + else { + idx=1; + + while (idxshdr->MaxNumMergeCand-1) { + if (decode_CABAC_bypass(&tctx->cabac_decoder)) { + idx++; + } + else { + break; + } + } + } + + logtrace(LogSlice,"> merge_idx = %d\n",idx); + logtrace(LogSymbols,"$1 merge_idx=%d\n",idx); + + return idx; +} + + +static int decode_pred_mode_flag(thread_context* tctx) +{ + logtrace(LogSlice,"# pred_mode_flag\n"); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_PRED_MODE_FLAG]); + + logtrace(LogSymbols,"$1 pred_mode=%d\n",bit); + return bit; +} + +static int decode_mvp_lx_flag(thread_context* tctx) +{ + logtrace(LogSlice,"# mvp_lx_flag\n"); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_MVP_LX_FLAG]); + + logtrace(LogSymbols,"$1 mvp_lx_flag=%d\n",bit); + return bit; +} + +static int decode_rqt_root_cbf(thread_context* tctx) +{ + logtrace(LogSlice,"# rqt_root_cbf\n"); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_RQT_ROOT_CBF]); + + logtrace(LogSymbols,"$1 rqt_root_cbf=%d\n",bit); + return bit; +} + +static int decode_ref_idx_lX(thread_context* tctx, int numRefIdxLXActive) +{ + logtrace(LogSlice,"# 
ref_idx_lX\n"); + + int cMax = numRefIdxLXActive-1; + + if (cMax==0) { + logtrace(LogSlice,"> ref_idx = 0 (cMax==0)\n"); + return 0; + } // do check for single reference frame here + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_REF_IDX_LX + 0]); + + int idx=0; + + while (bit) { + idx++; + if (idx==cMax) { break; } + + if (idx==1) { + bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_REF_IDX_LX + 1]); + } + else { + bit = decode_CABAC_bypass(&tctx->cabac_decoder); + } + } + + logtrace(LogSlice,"> ref_idx = %d\n",idx); + + logtrace(LogSymbols,"$1 ref_idx_lX=%d\n",idx); + return idx; +} + + +static enum InterPredIdc decode_inter_pred_idc(thread_context* tctx, + int x0, int y0, + int nPbW, int nPbH, + int ctDepth) +{ + logtrace(LogSlice,"# inter_pred_idc\n"); + + int value; + + context_model* model = &tctx->ctx_model[CONTEXT_MODEL_INTER_PRED_IDC]; + + if (nPbW+nPbH==12) { + value = decode_CABAC_bit(&tctx->cabac_decoder, + &model[4]); + } + else { + int bit0 = decode_CABAC_bit(&tctx->cabac_decoder, + &model[ctDepth]); + if (bit0==0) { + value = decode_CABAC_bit(&tctx->cabac_decoder, + &model[4]); + } + else { + value = 2; + } + } + + logtrace(LogSlice,"> inter_pred_idc = %d (%s)\n",value, + value==0 ? "L0" : (value==1 ? "L1" : "BI")); + + logtrace(LogSymbols,"$1 decode_inter_pred_idx=%d\n",value+1); + + return (enum InterPredIdc) (value+1); +} + + +static int decode_explicit_rdpcm_flag(thread_context* tctx,int cIdx) +{ + context_model* model = &tctx->ctx_model[CONTEXT_MODEL_RDPCM_FLAG]; + int value = decode_CABAC_bit(&tctx->cabac_decoder, &model[cIdx ? 1 : 0]); + return value; +} + + +static int decode_explicit_rdpcm_dir(thread_context* tctx,int cIdx) +{ + context_model* model = &tctx->ctx_model[CONTEXT_MODEL_RDPCM_DIR]; + int value = decode_CABAC_bit(&tctx->cabac_decoder, &model[cIdx ? 
1 : 0]); + return value; +} + + + +/* Take CtbAddrInTS and compute + -> CtbAddrInRS, CtbX, CtbY + */ +bool setCtbAddrFromTS(thread_context* tctx) +{ + const seq_parameter_set& sps = tctx->img->get_sps(); + + if (tctx->CtbAddrInTS < sps.PicSizeInCtbsY) { + tctx->CtbAddrInRS = tctx->img->get_pps().CtbAddrTStoRS[tctx->CtbAddrInTS]; + + tctx->CtbX = tctx->CtbAddrInRS % sps.PicWidthInCtbsY; + tctx->CtbY = tctx->CtbAddrInRS / sps.PicWidthInCtbsY; + return false; + } + else { + tctx->CtbAddrInRS = sps.PicSizeInCtbsY; + + tctx->CtbX = tctx->CtbAddrInRS % sps.PicWidthInCtbsY; + tctx->CtbY = tctx->CtbAddrInRS / sps.PicWidthInCtbsY; + return true; + } +} + +// returns true when we reached the end of the image (ctbAddr==picSizeInCtbsY) +bool advanceCtbAddr(thread_context* tctx) +{ + tctx->CtbAddrInTS++; + + return setCtbAddrFromTS(tctx); +} + + +void read_sao(thread_context* tctx, int xCtb,int yCtb, + int CtbAddrInSliceSeg) +{ + slice_segment_header* shdr = tctx->shdr; + de265_image* img = tctx->img; + const seq_parameter_set& sps = img->get_sps(); + const pic_parameter_set& pps = img->get_pps(); + + logtrace(LogSlice,"# read_sao(%d,%d)\n",xCtb,yCtb); + + sao_info saoinfo; + memset(&saoinfo,0,sizeof(sao_info)); + logtrace(LogSlice,"sizeof saoinfo: %d\n",sizeof(sao_info)); + + + char sao_merge_left_flag = 0; + char sao_merge_up_flag = 0; + + if (xCtb>0) { + //char leftCtbInSliceSeg = (CtbAddrInSliceSeg>0); + char leftCtbInSliceSeg = (tctx->CtbAddrInRS > shdr->SliceAddrRS); + char leftCtbInTile = (pps.TileIdRS[xCtb + yCtb * sps.PicWidthInCtbsY] == + pps.TileIdRS[xCtb-1 + yCtb * sps.PicWidthInCtbsY]); + + if (leftCtbInSliceSeg && leftCtbInTile) { + sao_merge_left_flag = decode_sao_merge_flag(tctx); + logtrace(LogSlice,"sao_merge_left_flag: %d\n",sao_merge_left_flag); + } + } + + if (yCtb>0 && sao_merge_left_flag==0) { + logtrace(LogSlice,"CtbAddrInRS:%d PicWidthInCtbsY:%d slice_segment_address:%d\n", + tctx->CtbAddrInRS, + sps.PicWidthInCtbsY, + shdr->slice_segment_address); + 
char upCtbInSliceSeg = (tctx->CtbAddrInRS - sps.PicWidthInCtbsY) >= shdr->SliceAddrRS; + char upCtbInTile = (pps.TileIdRS[xCtb + yCtb * sps.PicWidthInCtbsY] == + pps.TileIdRS[xCtb + (yCtb-1) * sps.PicWidthInCtbsY]); + + if (upCtbInSliceSeg && upCtbInTile) { + sao_merge_up_flag = decode_sao_merge_flag(tctx); + logtrace(LogSlice,"sao_merge_up_flag: %d\n",sao_merge_up_flag); + } + } + + if (!sao_merge_up_flag && !sao_merge_left_flag) { + int nChroma = 3; + if (sps.ChromaArrayType == CHROMA_MONO) nChroma=1; + + for (int cIdx=0; cIdxslice_sao_luma_flag && cIdx==0) || + (shdr->slice_sao_chroma_flag && cIdx>0)) { + + uint8_t SaoTypeIdx = 0; + + if (cIdx==0) { + char sao_type_idx_luma = decode_sao_type_idx(tctx); + logtrace(LogSlice,"sao_type_idx_luma: %d\n", sao_type_idx_luma); + saoinfo.SaoTypeIdx = SaoTypeIdx = sao_type_idx_luma; + } + else if (cIdx==1) { + char sao_type_idx_chroma = decode_sao_type_idx(tctx); + logtrace(LogSlice,"sao_type_idx_chroma: %d\n", sao_type_idx_chroma); + SaoTypeIdx = sao_type_idx_chroma; + saoinfo.SaoTypeIdx |= SaoTypeIdx<<(2*1); + saoinfo.SaoTypeIdx |= SaoTypeIdx<<(2*2); // set for both chroma components + } + else { + // SaoTypeIdx = 0 + + SaoTypeIdx = (saoinfo.SaoTypeIdx >> (2*cIdx)) & 0x3; + } + + if (SaoTypeIdx != 0) { + for (int i=0;i<4;i++) { + saoinfo.saoOffsetVal[cIdx][i] = decode_sao_offset_abs(tctx, img->get_bit_depth(cIdx)); + logtrace(LogSlice,"saoOffsetVal[%d][%d] = %d\n",cIdx,i, saoinfo.saoOffsetVal[cIdx][i]); + } + + int sign[4]; + if (SaoTypeIdx==1) { + for (int i=0;i<4;i++) { + if (saoinfo.saoOffsetVal[cIdx][i] != 0) { + sign[i] = decode_sao_offset_sign(tctx) ? 
-1 : 1; + } + else { + sign[i] = 0; // not really required, but compiler warns about uninitialized values + } + } + + saoinfo.sao_band_position[cIdx] = decode_sao_band_position(tctx); + } + else { + uint8_t SaoEoClass = 0; + + sign[0] = sign[1] = 1; + sign[2] = sign[3] = -1; + + if (cIdx==0) { + saoinfo.SaoEoClass = SaoEoClass = decode_sao_class(tctx); + } + else if (cIdx==1) { + SaoEoClass = decode_sao_class(tctx); + saoinfo.SaoEoClass |= SaoEoClass << (2*1); + saoinfo.SaoEoClass |= SaoEoClass << (2*2); + } + + logtrace(LogSlice,"SaoEoClass[%d] = %d\n",cIdx,SaoEoClass); + } + + int log2OffsetScale; + + if (cIdx==0) { + log2OffsetScale = pps.range_extension.log2_sao_offset_scale_luma; + } + else { + log2OffsetScale = pps.range_extension.log2_sao_offset_scale_chroma; + } + + for (int i=0;i<4;i++) { + saoinfo.saoOffsetVal[cIdx][i] = sign[i]*(saoinfo.saoOffsetVal[cIdx][i] << log2OffsetScale); + } + } + } + } + + img->set_sao_info(xCtb,yCtb, &saoinfo); + } + + + if (sao_merge_left_flag) { + img->set_sao_info(xCtb,yCtb, img->get_sao_info(xCtb-1,yCtb)); + } + + if (sao_merge_up_flag) { + img->set_sao_info(xCtb,yCtb, img->get_sao_info(xCtb,yCtb-1)); + } +} + + +void read_coding_tree_unit(thread_context* tctx) +{ + slice_segment_header* shdr = tctx->shdr; + de265_image* img = tctx->img; + const seq_parameter_set& sps = img->get_sps(); + + int xCtb = (tctx->CtbAddrInRS % sps.PicWidthInCtbsY); + int yCtb = (tctx->CtbAddrInRS / sps.PicWidthInCtbsY); + int xCtbPixels = xCtb << sps.Log2CtbSizeY; + int yCtbPixels = yCtb << sps.Log2CtbSizeY; + + logtrace(LogSlice,"----- decode CTB %d;%d (%d;%d) POC=%d, SliceAddrRS=%d\n", + xCtbPixels,yCtbPixels, xCtb,yCtb, + tctx->img->PicOrderCntVal, tctx->shdr->SliceAddrRS); + + img->set_SliceAddrRS(xCtb, yCtb, tctx->shdr->SliceAddrRS); + + img->set_SliceHeaderIndex(xCtbPixels,yCtbPixels, shdr->slice_index); + + int CtbAddrInSliceSeg = tctx->CtbAddrInRS - shdr->slice_segment_address; + + if (shdr->slice_sao_luma_flag || 
shdr->slice_sao_chroma_flag) + { + read_sao(tctx, xCtb,yCtb, CtbAddrInSliceSeg); + } + + read_coding_quadtree(tctx, xCtbPixels, yCtbPixels, sps.Log2CtbSizeY, 0); +} + + +LIBDE265_INLINE static int luma_pos_to_ctbAddrRS(const seq_parameter_set* sps, int x,int y) +{ + int ctbX = x >> sps->Log2CtbSizeY; + int ctbY = y >> sps->Log2CtbSizeY; + + return ctbY * sps->PicWidthInCtbsY + ctbX; +} + + +int check_CTB_available(const de265_image* img, + int xC,int yC, int xN,int yN) +{ + // check whether neighbor is outside of frame + + if (xN < 0 || yN < 0) { return 0; } + if (xN >= img->get_sps().pic_width_in_luma_samples) { return 0; } + if (yN >= img->get_sps().pic_height_in_luma_samples) { return 0; } + + + int current_ctbAddrRS = luma_pos_to_ctbAddrRS(&img->get_sps(), xC,yC); + int neighbor_ctbAddrRS = luma_pos_to_ctbAddrRS(&img->get_sps(), xN,yN); + + // TODO: check if this is correct (6.4.1) + + if (img->get_SliceAddrRS_atCtbRS(current_ctbAddrRS) != + img->get_SliceAddrRS_atCtbRS(neighbor_ctbAddrRS)) { + return 0; + } + + // check if both CTBs are in the same tile. 
+ + if (img->get_pps().TileIdRS[current_ctbAddrRS] != + img->get_pps().TileIdRS[neighbor_ctbAddrRS]) { + return 0; + } + + return 1; +} + + +int residual_coding(thread_context* tctx, + int x0, int y0, // position of TU in frame + int log2TrafoSize, + int cIdx) +{ + logtrace(LogSlice,"- residual_coding x0:%d y0:%d log2TrafoSize:%d cIdx:%d\n",x0,y0,log2TrafoSize,cIdx); + + //slice_segment_header* shdr = tctx->shdr; + + de265_image* img = tctx->img; + const seq_parameter_set& sps = img->get_sps(); + const pic_parameter_set& pps = img->get_pps(); + + enum PredMode PredMode = img->get_pred_mode(x0,y0); + + if (cIdx==0) { + img->set_nonzero_coefficient(x0,y0,log2TrafoSize); + } + + + if (pps.transform_skip_enabled_flag && + !tctx->cu_transquant_bypass_flag && + (log2TrafoSize <= pps.Log2MaxTransformSkipSize)) + { + tctx->transform_skip_flag[cIdx] = decode_transform_skip_flag(tctx,cIdx); + } + else + { + tctx->transform_skip_flag[cIdx] = 0; + } + + + tctx->explicit_rdpcm_flag = false; + + if (PredMode == MODE_INTER && sps.range_extension.explicit_rdpcm_enabled_flag && + ( tctx->transform_skip_flag[cIdx] || tctx->cu_transquant_bypass_flag)) + { + tctx->explicit_rdpcm_flag = decode_explicit_rdpcm_flag(tctx,cIdx); + if (tctx->explicit_rdpcm_flag) { + tctx->explicit_rdpcm_dir = decode_explicit_rdpcm_dir(tctx,cIdx); + } + + //printf("EXPLICIT RDPCM %d;%d\n",x0,y0); + } + else + { + tctx->explicit_rdpcm_flag = false; + } + + + + // sbType for persistent_rice_adaptation_enabled_flag + + int sbType = (cIdx==0) ? 
2 : 0; + if (tctx->transform_skip_flag[cIdx] || tctx->cu_transquant_bypass_flag) { + sbType++; + } + + + // --- decode position of last coded coefficient --- + + int last_significant_coeff_x_prefix = + decode_last_significant_coeff_prefix(tctx,log2TrafoSize,cIdx, + &tctx->ctx_model[CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX]); + + int last_significant_coeff_y_prefix = + decode_last_significant_coeff_prefix(tctx,log2TrafoSize,cIdx, + &tctx->ctx_model[CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX]); + + + // TODO: we can combine both FL-bypass calls into one, but the gain may be limited... + + int LastSignificantCoeffX; + if (last_significant_coeff_x_prefix > 3) { + int nBits = (last_significant_coeff_x_prefix>>1)-1; + int last_significant_coeff_x_suffix = decode_CABAC_FL_bypass(&tctx->cabac_decoder,nBits); + + LastSignificantCoeffX = + ((2+(last_significant_coeff_x_prefix & 1)) << nBits) + last_significant_coeff_x_suffix; + } + else { + LastSignificantCoeffX = last_significant_coeff_x_prefix; + } + + int LastSignificantCoeffY; + if (last_significant_coeff_y_prefix > 3) { + int nBits = (last_significant_coeff_y_prefix>>1)-1; + int last_significant_coeff_y_suffix = decode_CABAC_FL_bypass(&tctx->cabac_decoder,nBits); + + LastSignificantCoeffY = + ((2+(last_significant_coeff_y_prefix & 1)) << nBits) + last_significant_coeff_y_suffix; + } + else { + LastSignificantCoeffY = last_significant_coeff_y_prefix; + } + + + + // --- determine scanIdx --- + + int scanIdx; + + if (PredMode == MODE_INTRA) { + if (cIdx==0) { + scanIdx = get_intra_scan_idx(log2TrafoSize, img->get_IntraPredMode(x0,y0), cIdx, &sps); + //printf("luma scan idx=%d <- intra mode=%d\n",scanIdx, img->get_IntraPredMode(x0,y0)); + } + else { + scanIdx = get_intra_scan_idx(log2TrafoSize, img->get_IntraPredModeC(x0,y0), cIdx, &sps); + //printf("chroma scan idx=%d <- intra mode=%d chroma:%d trsize:%d\n",scanIdx, + // img->get_IntraPredModeC(x0,y0), sps->chroma_format_idc, 1<nCoeff[cIdx] = 0; + + + 
// i - subblock index + // n - coefficient index in subblock + + for (int i=lastSubBlock;i>=0;i--) { + position S = ScanOrderSub[i]; + int inferSbDcSigCoeffFlag=0; + + logtrace(LogSlice,"sub block scan idx: %d\n",i); + + + // --- check whether this sub-block is coded --- + + int sub_block_is_coded = 0; + + if ((i0)) { + sub_block_is_coded = decode_coded_sub_block_flag(tctx, cIdx, + coded_sub_block_neighbors[S.x+S.y*sbWidth]); + inferSbDcSigCoeffFlag=1; + } + else if (i==0 || i==lastSubBlock) { + // first (DC) and last sub-block are always coded + // - the first will most probably contain coefficients + // - the last obviously contains the last coded coefficient + + sub_block_is_coded = 1; + } + + if (sub_block_is_coded) { + if (S.x > 0) coded_sub_block_neighbors[S.x-1 + S.y *sbWidth] |= 1; + if (S.y > 0) coded_sub_block_neighbors[S.x + (S.y-1)*sbWidth] |= 2; + } + + + // ----- find significant coefficients in this sub-block ----- + + int16_t coeff_value[16]; + int8_t coeff_scan_pos[16]; + int8_t coeff_sign[16]; + int8_t coeff_has_max_base_level[16]; + int nCoefficients=0; + + + if (sub_block_is_coded) { + int x0 = S.x<<2; + int y0 = S.y<<2; + + int log2w = log2TrafoSize-2; + int prevCsbf = coded_sub_block_neighbors[S.x+S.y*sbWidth]; + uint8_t* ctxIdxMap = ctxIdxLookup[log2w][!!cIdx][!!scanIdx][prevCsbf]; + + logdebug(LogSlice,"log2w:%d cIdx:%d scanIdx:%d prevCsbf:%d\n", + log2w,cIdx,scanIdx,prevCsbf); + + + // set the last coded coefficient in the last subblock + + int last_coeff = (i==lastSubBlock) ? 
lastScanPos-1 : 15; + + if (i==lastSubBlock) { + coeff_value[nCoefficients] = 1; + coeff_has_max_base_level[nCoefficients] = 1; + coeff_scan_pos[nCoefficients] = lastScanPos; + nCoefficients++; + } + + + // --- decode all coefficients' significant_coeff flags except for the DC coefficient --- + + for (int n= last_coeff ; n>0 ; n--) { + int subX = ScanOrderPos[n].x; + int subY = ScanOrderPos[n].y; + xC = x0 + subX; + yC = y0 + subY; + + + // for all AC coefficients in sub-block, a significant_coeff flag is coded + + int ctxInc; + if (sps.range_extension.transform_skip_context_enabled_flag && + (tctx->cu_transquant_bypass_flag || tctx->transform_skip_flag[cIdx])) { + ctxInc = ( cIdx == 0 ) ? 42 : (16+27); + } + else { + ctxInc = ctxIdxMap[xC+(yC<=0) // last coded coefficient (always set to 1) is not the DC coefficient + { + if (inferSbDcSigCoeffFlag==0) { + // if we cannot infert the DC coefficient, it is coded + + int ctxInc; + if (sps.range_extension.transform_skip_context_enabled_flag && + (tctx->cu_transquant_bypass_flag || tctx->transform_skip_flag[cIdx])) { + ctxInc = ( cIdx == 0 ) ? 
42 : (16+27); + } + else { + ctxInc = ctxIdxMap[x0+(y0<0) { ctxSet=0; } + else { ctxSet=2; } + + if (c1==0) { ctxSet++; } + c1=1; + + + // --- decode greater-1 flags --- + + int newLastGreater1ScanPos=-1; + + int lastGreater1Coefficient = libde265_min(8,nCoefficients); + for (int c=0;c0) { + c1++; + } + } + } + + firstSubblock = false; + lastSubblock_greater1Ctx = lastInvocation_greater1Ctx; + + + // --- decode greater-2 flag --- + + if (newLastGreater1ScanPos != -1) { + int flag = decode_coeff_abs_level_greater2(tctx,cIdx, lastInvocation_ctxSet); + coeff_value[newLastGreater1ScanPos] += flag; + coeff_has_max_base_level[newLastGreater1ScanPos] = flag; + } + + + // --- decode coefficient signs --- + + int signHidden; + + + IntraPredMode predModeIntra; + if (cIdx==0) predModeIntra = img->get_IntraPredMode(x0,y0); + else predModeIntra = img->get_IntraPredModeC(x0,y0); + + + if (tctx->cu_transquant_bypass_flag || + (PredMode == MODE_INTRA && + sps.range_extension.implicit_rdpcm_enabled_flag && + tctx->transform_skip_flag[cIdx] && + ( predModeIntra == 10 || predModeIntra == 26 )) || + tctx->explicit_rdpcm_flag) + { + signHidden = 0; + } + else + { + signHidden = (coeff_scan_pos[0]-coeff_scan_pos[nCoefficients-1] > 3); + } + + + for (int n=0;ncabac_decoder); + logtrace(LogSlice,"sign[%d] = %d\n", n, coeff_sign[n]); + } + + // n==nCoefficients-1 + if (!pps.sign_data_hiding_flag || !signHidden) { + coeff_sign[nCoefficients-1] = decode_CABAC_bypass(&tctx->cabac_decoder); + logtrace(LogSlice,"sign[%d] = %d\n", nCoefficients-1, coeff_sign[nCoefficients-1]); + } + else { + coeff_sign[nCoefficients-1] = 0; + } + + + // --- decode coefficient value --- + + int sumAbsLevel=0; + int uiGoRiceParam; + + if (sps.range_extension.persistent_rice_adaptation_enabled_flag==0) { + uiGoRiceParam = 0; + } + else { + uiGoRiceParam = tctx->StatCoeff[sbType]/4; + } + + // printf("initial uiGoRiceParam=%d\n",uiGoRiceParam); + bool firstCoeffWithAbsLevelRemaining = true; + + for (int n=0;n 
3*(1<4) uiGoRiceParam=4; + } + } + else { + if (baseLevel + coeff_abs_level_remaining > 3*(1<= (3 << (tctx->StatCoeff[sbType]/4 ))) { + tctx->StatCoeff[sbType]++; + } + else if (2*coeff_abs_level_remaining < (1 << (tctx->StatCoeff[sbType]/4 )) && + tctx->StatCoeff[sbType] > 0) { + tctx->StatCoeff[sbType]--; + } + } + + firstCoeffWithAbsLevelRemaining=false; + } + else { + coeff_abs_level_remaining = 0; + } + + logtrace(LogSlice, "coeff_abs_level_remaining=%d\n",coeff_abs_level_remaining); + + + int16_t currCoeff = baseLevel + coeff_abs_level_remaining; + if (coeff_sign[n]) { + currCoeff = -currCoeff; + } + + if (pps.sign_data_hiding_flag && signHidden) { + sumAbsLevel += baseLevel + coeff_abs_level_remaining; + + if (n==nCoefficients-1 && (sumAbsLevel & 1)) { + currCoeff = -currCoeff; + } + } + + logtrace(LogSlice, "quantized coefficient=%d\n",currCoeff); + +#ifdef DE265_LOG_TRACE + //TransCoeffLevel[yC*CoeffStride + xC] = currCoeff; +#endif + + // put coefficient in list + int p = coeff_scan_pos[n]; + xC = (S.x<<2) + ScanOrderPos[p].x; + yC = (S.y<<2) + ScanOrderPos[p].y; + + tctx->coeffList[cIdx][ tctx->nCoeff[cIdx] ] = currCoeff; + tctx->coeffPos [cIdx][ tctx->nCoeff[cIdx] ] = xC + yC*CoeffStride; + tctx->nCoeff[cIdx]++; + + //printf("%d ",currCoeff); + } // iterate through coefficients in sub-block + + //printf(" (%d;%d)\n",x0,y0); + + } // if nonZero + } // next sub-block + + return DE265_OK; +} + + +static void decode_TU(thread_context* tctx, + int x0,int y0, + int xCUBase,int yCUBase, + int nT, int cIdx, enum PredMode cuPredMode, bool cbf) +{ + de265_image* img = tctx->img; + const seq_parameter_set& sps = img->get_sps(); + + int residualDpcm = 0; + + if (cuPredMode == MODE_INTRA) // if intra mode + { + enum IntraPredMode intraPredMode; + + if (cIdx==0) { + intraPredMode = img->get_IntraPredMode(x0,y0); + } + else { + const int SubWidthC = sps.SubWidthC; + const int SubHeightC = sps.SubHeightC; + + intraPredMode = 
img->get_IntraPredModeC(x0*SubWidthC,y0*SubHeightC); + } + + if (intraPredMode<0 || intraPredMode>=35) { + // TODO: ERROR + intraPredMode = INTRA_DC; + } + + decode_intra_prediction(img, x0,y0, intraPredMode, nT, cIdx); + + + residualDpcm = sps.range_extension.implicit_rdpcm_enabled_flag && + (tctx->cu_transquant_bypass_flag || tctx->transform_skip_flag[cIdx]) && + (intraPredMode == 10 || intraPredMode == 26); + + if (residualDpcm && intraPredMode == 26) + residualDpcm = 2; + } + else // INTER + { + if (tctx->explicit_rdpcm_flag) { + residualDpcm = (tctx->explicit_rdpcm_dir ? 2 : 1); + } + } + + if (cbf) { + scale_coefficients(tctx, x0,y0, xCUBase,yCUBase, nT, cIdx, + tctx->transform_skip_flag[cIdx], cuPredMode==MODE_INTRA, residualDpcm); + } + /* + else if (!cbf && cIdx==0) { + memset(tctx->residual_luma,0,32*32*sizeof(int32_t)); + } + */ + else if (!cbf && cIdx!=0 && tctx->ResScaleVal) { + // --- cross-component-prediction when CBF==0 --- + + tctx->nCoeff[cIdx] = 0; + residualDpcm=0; + + scale_coefficients(tctx, x0,y0, xCUBase,yCUBase, nT, cIdx, + tctx->transform_skip_flag[cIdx], cuPredMode==MODE_INTRA, residualDpcm); + } +} + + +static int decode_log2_res_scale_abs_plus1(thread_context* tctx, int cIdxMinus1) +{ + //const int context = (cIdx==0) ? 0 : 1; + + logtrace(LogSlice,"# log2_res_scale_abs_plus1 (c=%d)\n",cIdxMinus1); + + int value = 0; + int cMax = 4; + for (int binIdx=0;binIdxcabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_LOG2_RES_SCALE_ABS_PLUS1+ctxIdxInc]); + if (!bit) break; + value++; + } + + logtrace(LogSymbols,"$1 log2_res_scale_abs_plus1=%d\n",value); + + return value; +} + + +static int decode_res_scale_sign_flag(thread_context* tctx, int cIdxMinus1) +{ + //const int context = (cIdx==0) ? 
0 : 1; + + logtrace(LogSlice,"# res_scale_sign_flag (c=%d)\n",cIdxMinus1); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_RES_SCALE_SIGN_FLAG+cIdxMinus1]); + + logtrace(LogSymbols,"$1 res_scale_sign_flag=%d\n",bit); + + return bit; +} + + +static void read_cross_comp_pred(thread_context* tctx, int cIdxMinus1) +{ + int log2_res_scale_abs_plus1 = decode_log2_res_scale_abs_plus1(tctx,cIdxMinus1); + int ResScaleVal; + + if (log2_res_scale_abs_plus1 != 0) { + int res_scale_sign_flag = decode_res_scale_sign_flag(tctx,cIdxMinus1); + + ResScaleVal = 1 << (log2_res_scale_abs_plus1 - 1); + ResScaleVal *= 1 - 2 * res_scale_sign_flag; + } + else { + ResScaleVal = 0; + } + + tctx->ResScaleVal = ResScaleVal; +} + + +int read_transform_unit(thread_context* tctx, + int x0, int y0, // position of TU in frame + int xBase, int yBase, // position of parent TU in frame + int xCUBase,int yCUBase, // position of CU in frame + int log2TrafoSize, + int trafoDepth, + int blkIdx, + int cbf_luma, int cbf_cb, int cbf_cr) +{ + logtrace(LogSlice,"- read_transform_unit x0:%d y0:%d xBase:%d yBase:%d nT:%d cbf:%d:%d:%d\n", + x0,y0,xBase,yBase, 1<img->get_sps(); + + const int ChromaArrayType = sps.ChromaArrayType; + + int log2TrafoSizeC = (ChromaArrayType==CHROMA_444 ? 
log2TrafoSize : log2TrafoSize-1); + log2TrafoSizeC = libde265_max(2, log2TrafoSizeC); + + const int cbfLuma = cbf_luma; + const int cbfChroma = cbf_cb | cbf_cr; + + tctx->transform_skip_flag[0]=0; + tctx->transform_skip_flag[1]=0; + tctx->transform_skip_flag[2]=0; + + tctx->explicit_rdpcm_flag = false; + + + enum PredMode cuPredMode = tctx->img->get_pred_mode(x0,y0); + + if (cbfLuma || cbfChroma) + { + bool doDecodeQuantParameters = false; + + if (tctx->img->get_pps().cu_qp_delta_enabled_flag && + !tctx->IsCuQpDeltaCoded) { + + int cu_qp_delta_abs = decode_cu_qp_delta_abs(tctx); + int cu_qp_delta_sign=0; + if (cu_qp_delta_abs) { + cu_qp_delta_sign = decode_CABAC_bypass(&tctx->cabac_decoder); + } + + tctx->IsCuQpDeltaCoded = 1; + tctx->CuQpDelta = cu_qp_delta_abs*(1-2*cu_qp_delta_sign); + + //printf("read cu_qp_delta (%d;%d) = %d\n",x0,y0,tctx->CuQpDelta); + + logtrace(LogSlice,"cu_qp_delta_abs = %d\n",cu_qp_delta_abs); + logtrace(LogSlice,"cu_qp_delta_sign = %d\n",cu_qp_delta_sign); + logtrace(LogSlice,"CuQpDelta = %d\n",tctx->CuQpDelta); + + doDecodeQuantParameters = true; + //decode_quantization_parameters(tctx, x0,y0, xCUBase, yCUBase); + } + + if (tctx->shdr->cu_chroma_qp_offset_enabled_flag && cbfChroma && + !tctx->cu_transquant_bypass_flag && !tctx->IsCuChromaQpOffsetCoded ) { + logtrace(LogSlice,"# cu_chroma_qp_offset_flag\n"); + + int cu_chroma_qp_offset_flag = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_FLAG]); + + + const pic_parameter_set& pps = tctx->img->get_pps(); + + int cu_chroma_qp_offset_idx = 0; + if (cu_chroma_qp_offset_flag && pps.range_extension.chroma_qp_offset_list_len > 1) { + cu_chroma_qp_offset_idx = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_IDX]); + } + + tctx->IsCuChromaQpOffsetCoded = 1; + + if (cu_chroma_qp_offset_flag) { + tctx->CuQpOffsetCb = pps.range_extension.cb_qp_offset_list[ cu_chroma_qp_offset_idx ]; + tctx->CuQpOffsetCr = 
pps.range_extension.cr_qp_offset_list[ cu_chroma_qp_offset_idx ]; + } + else { + tctx->CuQpOffsetCb = 0; + tctx->CuQpOffsetCr = 0; + } + + doDecodeQuantParameters = true; + //decode_quantization_parameters(tctx, x0,y0, xCUBase, yCUBase); + } + + + if (doDecodeQuantParameters) { + decode_quantization_parameters(tctx, x0,y0, xCUBase, yCUBase); + } + } + + // position of TU in local CU + int xL = x0 - xCUBase; + int yL = y0 - yCUBase; + int nT = 1<ResScaleVal = 0; + + int err; + if (cbf_luma) { + if ((err=residual_coding(tctx,x0,y0, log2TrafoSize,0)) != DE265_OK) return err; + } + + decode_TU(tctx, x0,y0, xCUBase,yCUBase, nT, 0, cuPredMode, cbf_luma); + + + // --- chroma --- + + const int yOffset422 = 1<2 || ChromaArrayType == CHROMA_444) { + // TODO: cross-component prediction + + const bool do_cross_component_prediction = + (tctx->img->get_pps().range_extension.cross_component_prediction_enabled_flag && + cbf_luma && + (cuPredMode == MODE_INTER || tctx->img->is_IntraPredModeC_Mode4(x0,y0))); + + if (do_cross_component_prediction) { + read_cross_comp_pred(tctx, 0); + } + else { + tctx->ResScaleVal = 0; + } + + { + if (cbf_cb & 1) { + if ((err=residual_coding(tctx,x0,y0,log2TrafoSizeC,1)) != DE265_OK) return err; + } + + if (sps.ChromaArrayType != CHROMA_MONO) { + decode_TU(tctx, + x0/SubWidthC,y0/SubHeightC, + xCUBase/SubWidthC,yCUBase/SubHeightC, nTC, 1, cuPredMode, cbf_cb & 1); + } + } + + // 4:2:2 + if (ChromaArrayType == CHROMA_422) { + const int yOffset = 1<ResScaleVal = 0; + } + + { + if (cbf_cr & 1) { + if ((err=residual_coding(tctx,x0,y0,log2TrafoSizeC,2)) != DE265_OK) return err; + } + + if (sps.ChromaArrayType != CHROMA_MONO) { + decode_TU(tctx, + x0/SubWidthC,y0/SubHeightC, + xCUBase/SubWidthC,yCUBase/SubHeightC, + nTC, 2, cuPredMode, cbf_cr & 1); + } + } + + // 4:2:2 + if (ChromaArrayType == CHROMA_422) { + const int yOffset = 1<get_width(0); + int h = img->get_height(0); + + for (int y=0;yget_log2CbSize(x,y)); + } + printf("\n"); + } +} + + +void 
read_transform_tree(thread_context* tctx, + int x0, int y0, // position of TU in frame + int xBase, int yBase, // position of parent TU in frame + int xCUBase, int yCUBase, // position of CU in frame + int log2TrafoSize, + int trafoDepth, + int blkIdx, + int MaxTrafoDepth, + int IntraSplitFlag, + enum PredMode cuPredMode, + uint8_t parent_cbf_cb,uint8_t parent_cbf_cr) +{ + logtrace(LogSlice,"- read_transform_tree (interleaved) x0:%d y0:%d xBase:%d yBase:%d " + "log2TrafoSize:%d trafoDepth:%d MaxTrafoDepth:%d parent-cbf-cb:%d parent-cbf-cr:%d\n", + x0,y0,xBase,yBase,log2TrafoSize,trafoDepth,MaxTrafoDepth,parent_cbf_cb,parent_cbf_cr); + + de265_image* img = tctx->img; + const seq_parameter_set& sps = img->get_sps(); + + int split_transform_flag; + + enum PredMode PredMode = img->get_pred_mode(x0,y0); + assert(PredMode == cuPredMode); + + /* If TrafoSize is larger than maximum size -> split automatically + If TrafoSize is at minimum size -> do not split + If maximum transformation depth is reached -> do not split + If intra-prediction is NxN mode -> split automatically (only at level 0) + Otherwise -> read split flag + */ + if (log2TrafoSize <= sps.Log2MaxTrafoSize && + log2TrafoSize > sps.Log2MinTrafoSize && + trafoDepth < MaxTrafoDepth && + !(IntraSplitFlag && trafoDepth==0)) + { + split_transform_flag = decode_split_transform_flag(tctx, log2TrafoSize); + } + else + { + enum PartMode PartMode = img->get_PartMode(x0,y0); + + int interSplitFlag= (sps.max_transform_hierarchy_depth_inter==0 && + trafoDepth == 0 && + PredMode == MODE_INTER && + PartMode != PART_2Nx2N); + + split_transform_flag = (log2TrafoSize > sps.Log2MaxTrafoSize || + (IntraSplitFlag==1 && trafoDepth==0) || + interSplitFlag==1) ? 
1:0; + } + + if (split_transform_flag) { + logtrace(LogSlice,"set_split_transform_flag(%d,%d, %d)\n",x0,y0,trafoDepth); + img->set_split_transform_flag(x0,y0,trafoDepth); + } + + int cbf_cb=-1; + int cbf_cr=-1; + + // CBF_CB/CR flags are encoded like this: + // 4:2:0 and 4:4:4 modes: binary flag in bit 0 + // 4:2:2 mode: bit 0: top block, bit 1: bottom block + + if ((log2TrafoSize>2 && sps.ChromaArrayType != CHROMA_MONO) || + sps.ChromaArrayType == CHROMA_444) { + // we do not have to test for trafoDepth==0, because parent_cbf_cb is 1 at depth 0 + if (/*trafoDepth==0 ||*/ parent_cbf_cb) { + cbf_cb = decode_cbf_chroma(tctx,trafoDepth); + + if (sps.ChromaArrayType == CHROMA_422 && (!split_transform_flag || log2TrafoSize==3)) { + cbf_cb |= (decode_cbf_chroma(tctx,trafoDepth) << 1); + } + } + + // we do not have to test for trafoDepth==0, because parent_cbf_cb is 1 at depth 0 + if (/*trafoDepth==0 ||*/ parent_cbf_cr) { + cbf_cr = decode_cbf_chroma(tctx,trafoDepth); + + if (sps.ChromaArrayType == CHROMA_422 && (!split_transform_flag || log2TrafoSize==3)) { + cbf_cr |= (decode_cbf_chroma(tctx,trafoDepth) << 1); + } + } + } + + //printf("CBF: cb:%d cr:%d\n",cbf_cb,cbf_cr); + + // cbf_cr/cbf_cb not present in bitstream -> induce values + + if (cbf_cb<0) { + assert(!(trafoDepth==0 && log2TrafoSize==2)); + + /* The standard specifies to check trafoDepth>0 AND log2TrafoSize==2. + However, I think that trafoDepth>0 is redundant as a CB is always + at least 8x8 and hence trafoDepth>0. 
+ */ + + if (trafoDepth>0 && log2TrafoSize==2) { + cbf_cb = parent_cbf_cb; + } else { + cbf_cb=0; + } + } + + if (cbf_cr<0) { + if (trafoDepth>0 && log2TrafoSize==2) { + cbf_cr = parent_cbf_cr; + } else { + cbf_cr=0; + } + } + + if (split_transform_flag) { + int x1 = x0 + (1<<(log2TrafoSize-1)); + int y1 = y0 + (1<<(log2TrafoSize-1)); + + logtrace(LogSlice,"transform split.\n"); + + read_transform_tree(tctx, x0,y0, x0,y0, xCUBase,yCUBase, log2TrafoSize-1, trafoDepth+1, 0, + MaxTrafoDepth,IntraSplitFlag, cuPredMode, cbf_cb,cbf_cr); + read_transform_tree(tctx, x1,y0, x0,y0, xCUBase,yCUBase, log2TrafoSize-1, trafoDepth+1, 1, + MaxTrafoDepth,IntraSplitFlag, cuPredMode, cbf_cb,cbf_cr); + read_transform_tree(tctx, x0,y1, x0,y0, xCUBase,yCUBase, log2TrafoSize-1, trafoDepth+1, 2, + MaxTrafoDepth,IntraSplitFlag, cuPredMode, cbf_cb,cbf_cr); + read_transform_tree(tctx, x1,y1, x0,y0, xCUBase,yCUBase, log2TrafoSize-1, trafoDepth+1, 3, + MaxTrafoDepth,IntraSplitFlag, cuPredMode, cbf_cb,cbf_cr); + } + else { + int cbf_luma; + + if (PredMode==MODE_INTRA || trafoDepth!=0 || cbf_cb || cbf_cr) { + cbf_luma = decode_cbf_luma(tctx,trafoDepth); + } + else { + /* There cannot be INTER blocks with no residual data. + That case is already handled with rqt_root_cbf. 
+ */ + + cbf_luma = 1; + } + + logtrace(LogSlice,"call read_transform_unit %d/%d\n",x0,y0); + + read_transform_unit(tctx, x0,y0,xBase,yBase, xCUBase,yCUBase, log2TrafoSize,trafoDepth, blkIdx, + cbf_luma, cbf_cb, cbf_cr); + } +} + + +const char* part_mode_name(enum PartMode pm) +{ + switch (pm) { + case PART_2Nx2N: return "2Nx2N"; + case PART_2NxN: return "2NxN"; + case PART_Nx2N: return "Nx2N"; + case PART_NxN: return "NxN"; + case PART_2NxnU: return "2NxnU"; + case PART_2NxnD: return "2NxnD"; + case PART_nLx2N: return "nLx2N"; + case PART_nRx2N: return "nRx2N"; + } + + return "undefined part mode"; +} + + +void read_mvd_coding(thread_context* tctx, + int x0,int y0, int refList) +{ + int abs_mvd_greater0_flag[2]; + abs_mvd_greater0_flag[0] = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG+0]); + abs_mvd_greater0_flag[1] = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG+0]); + + int abs_mvd_greater1_flag[2]; + if (abs_mvd_greater0_flag[0]) { + abs_mvd_greater1_flag[0] = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG+1]); + } + else { + abs_mvd_greater1_flag[0]=0; + } + + if (abs_mvd_greater0_flag[1]) { + abs_mvd_greater1_flag[1] = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG+1]); + } + else { + abs_mvd_greater1_flag[1]=0; + } + + + int abs_mvd_minus2[2]; + int mvd_sign_flag[2]; + int value[2]; + + for (int c=0;c<2;c++) { + if (abs_mvd_greater0_flag[c]) { + if (abs_mvd_greater1_flag[c]) { + abs_mvd_minus2[c] = decode_CABAC_EGk_bypass(&tctx->cabac_decoder, 1); + } + else { + abs_mvd_minus2[c] = abs_mvd_greater1_flag[c] -1; + } + + mvd_sign_flag[c] = decode_CABAC_bypass(&tctx->cabac_decoder); + + value[c] = abs_mvd_minus2[c]+2; + if (mvd_sign_flag[c]) { value[c] = -value[c]; } + } + else { + value[c] = 0; + } + } + + //set_mvd(tctx->decctx, x0,y0, refList, value[0],value[1]); + 
tctx->motion.mvd[refList][0] = value[0]; + tctx->motion.mvd[refList][1] = value[1]; + + logtrace(LogSlice, "MVD[%d;%d|%d] = %d;%d\n",x0,y0,refList, value[0],value[1]); +} + + +void read_prediction_unit_SKIP(thread_context* tctx, + int x0, int y0, + int nPbW, int nPbH) // x0/y0/nPbW/nPbH are not read in this body +{ + int merge_idx = decode_merge_idx(tctx); // the only syntax element decoded here + + tctx->motion.merge_idx = merge_idx; + tctx->motion.merge_flag = true; + + logtrace(LogSlice,"prediction skip 2Nx2N, merge_idx: %d\n",merge_idx); +} + + +/* xC/yC : CB position + xB/yB : position offset of the PB + nPbW/nPbH : size of PB + nCS : CB size + */ +void read_prediction_unit(thread_context* tctx, + int xC,int yC, int xB,int yB, + int nPbW, int nPbH, + int ctDepth, int nCS,int partIdx) // ctDepth is passed to decode_inter_pred_idc(), partIdx to decode_prediction_unit() +{ + logtrace(LogSlice,"read_prediction_unit %d;%d %dx%d\n",xC+xB,yC+yB,nPbW,nPbH); // same position as x0;y0 computed below + + int x0 = xC+xB; + int y0 = yC+yB; + + slice_segment_header* shdr = tctx->shdr; + + int merge_flag = decode_merge_flag(tctx); + tctx->motion.merge_flag = merge_flag; + + if (merge_flag) { + int merge_idx = decode_merge_idx(tctx); + + logtrace(LogSlice,"prediction unit %d,%d, merge mode, index: %d\n",x0,y0,merge_idx); + + tctx->motion.merge_idx = merge_idx; + } + else { // no merge flag + enum InterPredIdc inter_pred_idc; + + if (shdr->slice_type == SLICE_TYPE_B) { + inter_pred_idc = decode_inter_pred_idc(tctx,x0,y0,nPbW,nPbH,ctDepth); + } + else { + inter_pred_idc = PRED_L0; // non-B slice: inter_pred_idc is not decoded, L0 is assumed + } + + tctx->motion.inter_pred_idc = inter_pred_idc; // set_inter_pred_idc(ctx,x0,y0, inter_pred_idc); + + if (inter_pred_idc != PRED_L1) { + int ref_idx_l0 = decode_ref_idx_lX(tctx, shdr->num_ref_idx_l0_active); + + // NOTE: case for only one reference frame is handled in decode_ref_idx_lX() + tctx->motion.refIdx[0] = ref_idx_l0; + + read_mvd_coding(tctx,x0,y0, 0); + + int mvp_l0_flag = decode_mvp_lx_flag(tctx); // l0 + tctx->motion.mvp_l0_flag = mvp_l0_flag; + + logtrace(LogSlice,"prediction unit %d,%d, L0, refIdx=%d mvp_l0_flag:%d\n", + x0,y0, tctx->motion.refIdx[0], mvp_l0_flag); + } + + if
(inter_pred_idc != PRED_L0) { + int ref_idx_l1 = decode_ref_idx_lX(tctx, shdr->num_ref_idx_l1_active); + + // NOTE: case for only one reference frame is handled in decode_ref_idx_lX() + tctx->motion.refIdx[1] = ref_idx_l1; + + if (shdr->mvd_l1_zero_flag && + inter_pred_idc == PRED_BI) { + tctx->motion.mvd[1][0] = 0; + tctx->motion.mvd[1][1] = 0; + } + else { + read_mvd_coding(tctx,x0,y0, 1); + } + + int mvp_l1_flag = decode_mvp_lx_flag(tctx); // l1 + tctx->motion.mvp_l1_flag = mvp_l1_flag; + + logtrace(LogSlice,"prediction unit %d,%d, L1, refIdx=%d mvp_l1_flag:%d\n", + x0,y0, tctx->motion.refIdx[1], mvp_l1_flag); + } + } + + + + decode_prediction_unit(tctx->decctx, tctx->shdr, tctx->img, tctx->motion, + xC,yC,xB,yB, nCS, nPbW,nPbH, partIdx); +} + + + + +template /* NOTE(review): the template parameter list seems to be missing here (pixel_t is used below), and further spans between '<' and '>' look dropped in this function and the next one - confirm against the original source before applying */ +void read_pcm_samples_internal(thread_context* tctx, int x0, int y0, int log2CbSize, + int cIdx, bitreader& br) +{ + const seq_parameter_set& sps = tctx->img->get_sps(); + + int nPcmBits; + int bitDepth; + + int w = 1<0) { + w /= sps.SubWidthC; + h /= sps.SubHeightC; + + x0 /= sps.SubWidthC; + y0 /= sps.SubHeightC; + + nPcmBits = sps.pcm_sample_bit_depth_chroma; + bitDepth = sps.BitDepth_C; + } + else { + nPcmBits = sps.pcm_sample_bit_depth_luma; + bitDepth = sps.BitDepth_Y; + } + + pixel_t* ptr; + int stride; + ptr = tctx->img->get_image_plane_at_pos_NEW(cIdx,x0,y0); + stride = tctx->img->get_image_stride(cIdx); + + int shift = bitDepth - nPcmBits; + + for (int y=0;ycabac_decoder.bitstream_curr; + br.bytes_remaining = tctx->cabac_decoder.bitstream_end - tctx->cabac_decoder.bitstream_curr; + br.nextbits = 0; + br.nextbits_cnt = 0; + + + if (tctx->img->high_bit_depth(0)) { + read_pcm_samples_internal(tctx,x0,y0,log2CbSize,0,br); + } else { + read_pcm_samples_internal(tctx,x0,y0,log2CbSize,0,br); + } + + if (tctx->img->get_sps().ChromaArrayType != CHROMA_MONO) { + if (tctx->img->high_bit_depth(1)) { + read_pcm_samples_internal(tctx,x0,y0,log2CbSize,1,br); +
read_pcm_samples_internal(tctx,x0,y0,log2CbSize,2,br); + } else { + read_pcm_samples_internal(tctx,x0,y0,log2CbSize,1,br); + read_pcm_samples_internal(tctx,x0,y0,log2CbSize,2,br); + } + } + + prepare_for_CABAC(&br); + tctx->cabac_decoder.bitstream_curr = br.data; + init_CABAC_decoder_2(&tctx->cabac_decoder); +} + + +int map_chroma_pred_mode(int intra_chroma_pred_mode, int IntraPredMode) +{ + if (intra_chroma_pred_mode==4) { + return IntraPredMode; + } + else { + static const enum IntraPredMode IntraPredModeCCand[4] = { + INTRA_PLANAR, + INTRA_ANGULAR_26, // vertical + INTRA_ANGULAR_10, // horizontal + INTRA_DC + }; + + int IntraPredModeC = IntraPredModeCCand[intra_chroma_pred_mode]; + if (IntraPredModeC == IntraPredMode) { + return INTRA_ANGULAR_34; + } + else { + return IntraPredModeC; + } + } +} + +// h.265-V2 Table 8-3 +static const uint8_t map_chroma_422[35] = { + 0,1,2, 2, 2, 2, 3, 5, 7, 8,10,12,13,15,17,18,19,20, + 21,22,23,23,24,24,25,25,26,27,27,28,28,29,29,30,31 +}; + +void read_coding_unit(thread_context* tctx, + int x0, int y0, // position of coding unit in frame + int log2CbSize, + int ctDepth) +{ + de265_image* img = tctx->img; + const seq_parameter_set& sps = img->get_sps(); + const pic_parameter_set& pps = img->get_pps(); + slice_segment_header* shdr = tctx->shdr; + + logtrace(LogSlice,"- read_coding_unit %d;%d cbsize:%d\n",x0,y0,1<set_log2CbSize(x0,y0, log2CbSize, true); + + /* This is only required on corrupted input streams. + It may happen that there are several slices in the image that overlap. + In this case, flags would accumulate from both slices. 
+ */ + img->clear_split_transform_flags(x0,y0, log2CbSize); + + int nCbS = 1<cu_transquant_bypass_flag = transquant_bypass; + + if (transquant_bypass) { + img->set_cu_transquant_bypass(x0,y0,log2CbSize); + } + } + else { + tctx->cu_transquant_bypass_flag = 0; + } + + uint8_t cu_skip_flag = 0; + if (shdr->slice_type != SLICE_TYPE_I) { + cu_skip_flag = decode_cu_skip_flag(tctx,x0,y0,ctDepth); + } + + int IntraSplitFlag = 0; + + enum PredMode cuPredMode; + + if (cu_skip_flag) { + read_prediction_unit_SKIP(tctx,x0,y0,nCbS,nCbS); + + img->set_PartMode(x0,y0, PART_2Nx2N); // need this for deblocking filter + img->set_pred_mode(x0,y0,log2CbSize, MODE_SKIP); + cuPredMode = MODE_SKIP; + + logtrace(LogSlice,"CU pred mode: SKIP\n"); + + + // DECODE + + int nCS_L = 1<decctx,tctx->shdr,tctx->img,tctx->motion, + x0,y0, 0,0, nCS_L, nCS_L,nCS_L, 0); + } + else /* not skipped */ { + if (shdr->slice_type != SLICE_TYPE_I) { + int pred_mode_flag = decode_pred_mode_flag(tctx); + cuPredMode = pred_mode_flag ? MODE_INTRA : MODE_INTER; + } + else { + cuPredMode = MODE_INTRA; + } + + img->set_pred_mode(x0,y0,log2CbSize, cuPredMode); + + logtrace(LogSlice,"CU pred mode: %s\n", cuPredMode==MODE_INTRA ? "INTRA" : "INTER"); + + + enum PartMode PartMode; + + if (cuPredMode != MODE_INTRA || + log2CbSize == sps.Log2MinCbSizeY) { + PartMode = decode_part_mode(tctx, cuPredMode, log2CbSize); + + if (PartMode==PART_NxN && cuPredMode==MODE_INTRA) { + IntraSplitFlag=1; + } + } else { + PartMode = PART_2Nx2N; + } + + img->set_PartMode(x0,y0, PartMode); // needed for deblocking ? 
+ + logtrace(LogSlice, "PartMode: %s\n", part_mode_name(PartMode)); + + + bool pcm_flag = false; + + if (cuPredMode == MODE_INTRA) { + if (PartMode == PART_2Nx2N && sps.pcm_enabled_flag && + log2CbSize >= sps.Log2MinIpcmCbSizeY && + log2CbSize <= sps.Log2MaxIpcmCbSizeY) { + pcm_flag = decode_CABAC_term_bit(&tctx->cabac_decoder); + } + + if (pcm_flag) { + img->set_pcm_flag(x0,y0,log2CbSize); + + read_pcm_samples(tctx, x0,y0, log2CbSize); + } + else { + int pbOffset = (PartMode == PART_NxN) ? (nCbS/2) : nCbS; + int log2IntraPredSize = (PartMode == PART_NxN) ? (log2CbSize-1) : log2CbSize; + + logtrace(LogSlice,"nCbS:%d pbOffset:%d\n",nCbS,pbOffset); + + int prev_intra_luma_pred_flag[4]; + + int idx=0; + for (int j=0;j0); // left candidate always available for right blk + int availableB = availableB0 || (j>0); // top candidate always available for bottom blk + + + + int PUidx = (x>>sps.Log2MinPUSize) + (y>>sps.Log2MinPUSize)*sps.PicWidthInMinPUs; + + enum IntraPredMode candModeList[3]; + + fillIntraPredModeCandidates(candModeList,x,y,PUidx, + availableA, availableB, img); + + for (int i=0;i<3;i++) + logtrace(LogSlice,"candModeList[%d] = %d\n", i, candModeList[i]); + + if (prev_intra_luma_pred_flag[idx]==1) { + IntraPredMode = candModeList[ mpm_idx[idx] ]; + } + else { + // sort candModeList + + if (candModeList[0] > candModeList[1]) { + std::swap(candModeList[0],candModeList[1]); + } + if (candModeList[0] > candModeList[2]) { + std::swap(candModeList[0],candModeList[2]); + } + if (candModeList[1] > candModeList[2]) { + std::swap(candModeList[1],candModeList[2]); + } + + // skip modes in the list + // (we have 35 modes. 
skipping the 3 in the list gives us 32, which can be selected by 5 bits) + IntraPredMode = rem_intra_luma_pred_mode[idx]; + for (int n=0;n<=2;n++) { + if (IntraPredMode >= candModeList[n]) { IntraPredMode++; } + } + } + + logtrace(LogSlice,"IntraPredMode[%d][%d] = %d (log2blk:%d)\n",x,y,IntraPredMode, log2IntraPredSize); + + img->set_IntraPredMode(PUidx, log2IntraPredSize, + (enum IntraPredMode)IntraPredMode); + + idx++; + } + + + // set chroma intra prediction mode + + if (sps.ChromaArrayType == CHROMA_444) { + // chroma 4:4:4 + + idx = 0; + for (int j=0;jget_IntraPredMode(x,y); + + int IntraPredModeC = map_chroma_pred_mode(intra_chroma_pred_mode, IntraPredMode); + + logtrace(LogSlice,"IntraPredModeC[%d][%d]: %d (blksize:%d)\n",x,y,IntraPredModeC, + 1<set_IntraPredModeC(x,y, log2IntraPredSize, + (enum IntraPredMode)IntraPredModeC, + intra_chroma_pred_mode == 4); + idx++; + } + } + else if (sps.ChromaArrayType != CHROMA_MONO) { + // chroma 4:2:0 and 4:2:2 + + int intra_chroma_pred_mode = decode_intra_chroma_pred_mode(tctx); + int IntraPredMode = img->get_IntraPredMode(x0,y0); + logtrace(LogSlice,"IntraPredMode: %d\n",IntraPredMode); + int IntraPredModeC = map_chroma_pred_mode(intra_chroma_pred_mode, IntraPredMode); + + if (sps.ChromaArrayType == CHROMA_422) { + IntraPredModeC = map_chroma_422[ IntraPredModeC ]; + } + + img->set_IntraPredModeC(x0,y0, log2CbSize, + (enum IntraPredMode)IntraPredModeC, + intra_chroma_pred_mode == 4); + } + } + } + else { // INTER + int nCS = 1<motion.merge_flag; // !!get_merge_flag(ctx,x0,y0); + + if (cuPredMode != MODE_INTRA && + !(PartMode == PART_2Nx2N && merge_flag)) { + + rqt_root_cbf = !!decode_rqt_root_cbf(tctx); + } + else { + /* rqt_root_cbf=1 is inferred for Inter blocks with 2Nx2N, merge mode. + These must be some residual data, because otherwise, the CB could + also be coded in SKIP mode. 
+ */ + + rqt_root_cbf = true; + } + + //set_rqt_root_cbf(ctx,x0,y0, log2CbSize, rqt_root_cbf); + + if (rqt_root_cbf) { + int MaxTrafoDepth; + + if (cuPredMode==MODE_INTRA) { + MaxTrafoDepth = sps.max_transform_hierarchy_depth_intra + IntraSplitFlag; + } + else { + MaxTrafoDepth = sps.max_transform_hierarchy_depth_inter; + } + + logtrace(LogSlice,"MaxTrafoDepth: %d\n",MaxTrafoDepth); + + uint8_t initial_chroma_cbf = 1; + if (sps.ChromaArrayType == CHROMA_MONO) { + initial_chroma_cbf = 0; + } + + read_transform_tree(tctx, x0,y0, x0,y0, x0,y0, log2CbSize, 0,0, + MaxTrafoDepth, IntraSplitFlag, cuPredMode, + initial_chroma_cbf, initial_chroma_cbf); + } + } // !pcm + } +} + + +// ------------------------------------------------------------------------------------------ + + +void read_coding_quadtree(thread_context* tctx, + int x0, int y0, + int log2CbSize, + int ctDepth) +{ + logtrace(LogSlice,"- read_coding_quadtree %d;%d cbsize:%d depth:%d POC:%d\n",x0,y0,1<img->PicOrderCntVal); + + de265_image* img = tctx->img; + const seq_parameter_set& sps = img->get_sps(); + const pic_parameter_set& pps = img->get_pps(); + + int split_flag; + + // We only send a split flag if CU is larger than minimum size and + // completely contained within the image area. + // If it is partly outside the image area and not at minimum size, + // it is split. If already at minimum size, it is not split further. + if (x0+(1< sps.Log2MinCbSizeY) { + split_flag = decode_split_cu_flag(tctx, x0,y0, ctDepth); + } else { + if (log2CbSize > sps.Log2MinCbSizeY) { split_flag=1; } + else { split_flag=0; } + } + + + if (pps.cu_qp_delta_enabled_flag && + log2CbSize >= pps.Log2MinCuQpDeltaSize) + { + tctx->IsCuQpDeltaCoded = 0; + tctx->CuQpDelta = 0; + } + else + { + // shdr->CuQpDelta = 0; // TODO check: is this the right place to set to default value ? 
+ } + + + if (tctx->shdr->cu_chroma_qp_offset_enabled_flag && + log2CbSize >= pps.Log2MinCuChromaQpOffsetSize) { + tctx->IsCuChromaQpOffsetCoded = 0; + } + + if (split_flag) { + int x1 = x0 + (1<<(log2CbSize-1)); + int y1 = y0 + (1<<(log2CbSize-1)); + + read_coding_quadtree(tctx,x0,y0, log2CbSize-1, ctDepth+1); + + if (x1set_ctDepth(x0,y0, log2CbSize, ctDepth); + + read_coding_unit(tctx, x0,y0, log2CbSize, ctDepth); + } + + logtrace(LogSlice,"-\n"); +} + + +// --------------------------------------------------------------------------- + +enum DecodeResult { + Decode_EndOfSliceSegment, + Decode_EndOfSubstream, + Decode_Error +}; + +/* Decode CTBs until the end of sub-stream, the end-of-slice, or some error occurs. + */ +enum DecodeResult decode_substream(thread_context* tctx, + bool block_wpp, // block on WPP dependencies + bool first_independent_substream) +{ + const pic_parameter_set& pps = tctx->img->get_pps(); + const seq_parameter_set& sps = tctx->img->get_sps(); + + const int ctbW = sps.PicWidthInCtbsY; + + + const int startCtbY = tctx->CtbY; + + //printf("start decoding substream at %d;%d\n",tctx->CtbX,tctx->CtbY); + + // in WPP mode: initialize CABAC model with stored model from row above + + if ((!first_independent_substream || tctx->CtbY != startCtbY) && + pps.entropy_coding_sync_enabled_flag && + tctx->CtbY>=1 && tctx->CtbX==0) + { + if (sps.PicWidthInCtbsY>1) { + if ((tctx->CtbY-1) >= tctx->imgunit->ctx_models.size()) { + return Decode_Error; + } + + //printf("CTX wait on %d/%d\n",1,tctx->CtbY-1); + + // we have to wait until the context model data is there + tctx->img->wait_for_progress(tctx->task, 1,tctx->CtbY-1,CTB_PROGRESS_PREFILTER); + + // copy CABAC model from previous CTB row + tctx->ctx_model = tctx->imgunit->ctx_models[(tctx->CtbY-1)]; + tctx->imgunit->ctx_models[(tctx->CtbY-1)].release(); // not used anymore + } + else { + tctx->img->wait_for_progress(tctx->task, 0,tctx->CtbY-1,CTB_PROGRESS_PREFILTER); + initialize_CABAC_models(tctx); + } + } 
+ + + do { + const int ctbx = tctx->CtbX; + const int ctby = tctx->CtbY; + + if (ctbx+ctby*ctbW >= pps.CtbAddrRStoTS.size()) { + return Decode_Error; + } + + if (ctbx >= sps.PicWidthInCtbsY || + ctby >= sps.PicHeightInCtbsY) { + return Decode_Error; + } + + if (block_wpp && ctby>0 && ctbx < ctbW-1) { + + // TODO: if we are in tiles mode and at the right border, do not wait for x+1,y-1 + + //printf("wait on %d/%d (%d)\n",ctbx+1,ctby-1, ctbx+1+(ctby-1)*sps->PicWidthInCtbsY); + + tctx->img->wait_for_progress(tctx->task, ctbx+1,ctby-1, CTB_PROGRESS_PREFILTER); + } + + //printf("%p: decode %d;%d\n", tctx, tctx->CtbX,tctx->CtbY); + + + // read and decode CTB + + if (tctx->ctx_model.empty() == false) { + return Decode_Error; + } + + read_coding_tree_unit(tctx); + + + // save CABAC-model for WPP (except in last CTB row) + + if (pps.entropy_coding_sync_enabled_flag && + ctbx == 1 && + ctby < sps.PicHeightInCtbsY-1) + { + // no storage for context table has been allocated + if (tctx->imgunit->ctx_models.size() <= ctby) { + return Decode_Error; + } + + tctx->imgunit->ctx_models[ctby] = tctx->ctx_model; + tctx->imgunit->ctx_models[ctby].decouple(); // store an independent copy + } + + + // end of slice segment ? 
+ + int end_of_slice_segment_flag = decode_CABAC_term_bit(&tctx->cabac_decoder); + //printf("end-of-slice flag: %d\n", end_of_slice_segment_flag); + + if (end_of_slice_segment_flag) { + // at the end of the slice segment, we store the CABAC model if we need it + // because a dependent slice may follow + + if (pps.dependent_slice_segments_enabled_flag) { + tctx->shdr->ctx_model_storage = tctx->ctx_model; + tctx->shdr->ctx_model_storage.decouple(); // store an independent copy + + tctx->shdr->ctx_model_storage_defined = true; + } + } + + tctx->img->ctb_progress[ctbx+ctby*ctbW].set_progress(CTB_PROGRESS_PREFILTER); + + //printf("%p: decoded %d|%d\n",tctx, ctby,ctbx); + + + logtrace(LogSlice,"read CTB %d -> end=%d\n", tctx->CtbAddrInRS, end_of_slice_segment_flag); + //printf("read CTB %d -> end=%d\n", tctx->CtbAddrInRS, end_of_slice_segment_flag); + + const int lastCtbY = tctx->CtbY; + + bool endOfPicture = advanceCtbAddr(tctx); // true if we read past the end of the image + + if (endOfPicture && + end_of_slice_segment_flag == false) + { + tctx->decctx->add_warning(DE265_WARNING_CTB_OUTSIDE_IMAGE_AREA, false); + tctx->img->integrity = INTEGRITY_DECODING_ERRORS; + return Decode_Error; + } + + + if (end_of_slice_segment_flag) { + /* corrupted inputs may send the end_of_slice_segment_flag even if not all + CTBs in a row have been coded. Hence, we mark all of them as finished. 
+ */ + + /* + for (int x = ctbx+1 ; xPicWidthInCtbsY; x++) { + printf("mark skipped %d;%d\n",ctbx,ctby); + tctx->img->ctb_progress[ctbx+ctby*ctbW].set_progress(CTB_PROGRESS_PREFILTER); + } + */ + + return Decode_EndOfSliceSegment; + } + + + if (!end_of_slice_segment_flag) { + bool end_of_sub_stream = false; + end_of_sub_stream |= (pps.tiles_enabled_flag && + pps.TileId[tctx->CtbAddrInTS] != pps.TileId[tctx->CtbAddrInTS-1]); + end_of_sub_stream |= (pps.entropy_coding_sync_enabled_flag && + lastCtbY != tctx->CtbY); + + if (end_of_sub_stream) { + int end_of_sub_stream_one_bit = decode_CABAC_term_bit(&tctx->cabac_decoder); + if (!end_of_sub_stream_one_bit) { + tctx->decctx->add_warning(DE265_WARNING_EOSS_BIT_NOT_SET, false); + tctx->img->integrity = INTEGRITY_DECODING_ERRORS; + return Decode_Error; + } + + init_CABAC_decoder_2(&tctx->cabac_decoder); // byte alignment + return Decode_EndOfSubstream; + } + } + + } while (true); +} + + + +bool initialize_CABAC_at_slice_segment_start(thread_context* tctx) +{ + de265_image* img = tctx->img; + const pic_parameter_set& pps = img->get_pps(); + const seq_parameter_set& sps = img->get_sps(); + slice_segment_header* shdr = tctx->shdr; + + if (shdr->dependent_slice_segment_flag) { + int prevCtb = pps.CtbAddrTStoRS[ pps.CtbAddrRStoTS[shdr->slice_segment_address] -1 ]; + + int sliceIdx = img->get_SliceHeaderIndex_atIndex(prevCtb); + if (sliceIdx >= img->slices.size()) { + return false; + } + slice_segment_header* prevCtbHdr = img->slices[ sliceIdx ]; + + if (pps.is_tile_start_CTB(shdr->slice_segment_address % sps.PicWidthInCtbsY, + shdr->slice_segment_address / sps.PicWidthInCtbsY + )) { + initialize_CABAC_models(tctx); + } + else { + // wait for previous slice to finish decoding + + //printf("wait for previous slice to finish decoding\n"); + + + slice_unit* prevSliceSegment = tctx->imgunit->get_prev_slice_segment(tctx->sliceunit); + //assert(prevSliceSegment); + if (prevSliceSegment==NULL) { + return false; + } + + 
prevSliceSegment->finished_threads.wait_for_progress(prevSliceSegment->nThreads); + + + /* + printf("wait for %d,%d (init)\n", + prevCtb / sps->PicWidthInCtbsY, + prevCtb % sps->PicWidthInCtbsY); + tctx->img->wait_for_progress(tctx->task, prevCtb, CTB_PROGRESS_PREFILTER); + */ + + if (!prevCtbHdr->ctx_model_storage_defined) { + return false; + } + + tctx->ctx_model = prevCtbHdr->ctx_model_storage; + prevCtbHdr->ctx_model_storage.release(); + } + } + else { + initialize_CABAC_models(tctx); + } + + return true; +} + + +std::string thread_task_ctb_row::name() const { + char buf[100]; + sprintf(buf,"ctb-row-%d",debug_startCtbRow); + return buf; +} + + +std::string thread_task_slice_segment::name() const { + char buf[100]; + sprintf(buf,"slice-segment-%d;%d",debug_startCtbX,debug_startCtbY); + return buf; +} + + +void thread_task_slice_segment::work() +{ + thread_task_slice_segment* data = this; + thread_context* tctx = data->tctx; + de265_image* img = tctx->img; + + state = Running; + img->thread_run(this); + + setCtbAddrFromTS(tctx); + + //printf("%p: A start decoding at %d/%d\n", tctx, tctx->CtbX,tctx->CtbY); + + if (data->firstSliceSubstream) { + bool success = initialize_CABAC_at_slice_segment_start(tctx); + if (!success) { + state = Finished; + tctx->sliceunit->finished_threads.increase_progress(1); + img->thread_finishes(this); + return; + } + } + else { + initialize_CABAC_models(tctx); + } + + init_CABAC_decoder_2(&tctx->cabac_decoder); + + /*enum DecodeResult result =*/ decode_substream(tctx, false, data->firstSliceSubstream); + + state = Finished; + tctx->sliceunit->finished_threads.increase_progress(1); + img->thread_finishes(this); + + return; // DE265_OK; +} + + +void thread_task_ctb_row::work() +{ + thread_task_ctb_row* data = this; + thread_context* tctx = data->tctx; + de265_image* img = tctx->img; + + const seq_parameter_set& sps = img->get_sps(); + int ctbW = sps.PicWidthInCtbsY; + + state = Running; + img->thread_run(this); + + setCtbAddrFromTS(tctx); 
+ + int ctby = tctx->CtbAddrInRS / ctbW; + int myCtbRow = ctby; + + //printf("start CTB-row decoding at row %d\n", ctby); + + if (data->firstSliceSubstream) { + bool success = initialize_CABAC_at_slice_segment_start(tctx); + if (!success) { + // could not decode this row, mark whole row as finished + for (int x=0;xctb_progress[myCtbRow*ctbW + x].set_progress(CTB_PROGRESS_PREFILTER); + } + + state = Finished; + tctx->sliceunit->finished_threads.increase_progress(1); + img->thread_finishes(this); + return; + } + //initialize_CABAC(tctx); + } + + init_CABAC_decoder_2(&tctx->cabac_decoder); + + bool firstIndependentSubstream = + data->firstSliceSubstream && !tctx->shdr->dependent_slice_segment_flag; + + /*enum DecodeResult result =*/ + decode_substream(tctx, true, firstIndependentSubstream); + + // mark progress on remaining CTBs in row (in case of decoder error and early termination) + + // TODO: what about slices that end properly in the middle of a CTB row? + + if (tctx->CtbY == myCtbRow) { + int lastCtbX = sps.PicWidthInCtbsY; // assume no tiles when WPP is on + for (int x = tctx->CtbX; xctb_progress[myCtbRow*ctbW + x].set_progress(CTB_PROGRESS_PREFILTER); + } + } + } + + state = Finished; + tctx->sliceunit->finished_threads.increase_progress(1); + img->thread_finishes(this); +} + + +de265_error read_slice_segment_data(thread_context* tctx) +{ + setCtbAddrFromTS(tctx); + + de265_image* img = tctx->img; + const pic_parameter_set& pps = img->get_pps(); + const seq_parameter_set& sps = img->get_sps(); + slice_segment_header* shdr = tctx->shdr; + + bool success = initialize_CABAC_at_slice_segment_start(tctx); + if (!success) { + return DE265_ERROR_UNSPECIFIED_DECODING_ERROR; + } + + init_CABAC_decoder_2(&tctx->cabac_decoder); + + //printf("-----\n"); + + bool first_slice_substream = !shdr->dependent_slice_segment_flag; + + int substream=0; + + enum DecodeResult result; + do { + int ctby = tctx->CtbY; + + + // check whether entry_points[] are correct in the bitstream + 
+ + if (substream>0) { + if (substream-1 >= tctx->shdr->entry_point_offset.size() || + tctx->cabac_decoder.bitstream_curr - tctx->cabac_decoder.bitstream_start -2 /* -2 because of CABAC init */ + != tctx->shdr->entry_point_offset[substream-1]) { + tctx->decctx->add_warning(DE265_WARNING_INCORRECT_ENTRY_POINT_OFFSET, true); + } + } + + substream++; + + + result = decode_substream(tctx, false, first_slice_substream); + + + if (result == Decode_EndOfSliceSegment || + result == Decode_Error) { + break; + } + + first_slice_substream = false; + + if (pps.tiles_enabled_flag) { + initialize_CABAC_models(tctx); + } + } while (true); + + return DE265_OK; // NOTE(review): also reached when the loop ended on Decode_Error - confirm that this is intended +} + + +/* TODO: + When a task wants to block, but is the first in the list of pending tasks, + do some error concealment instead of blocking, since it will never be unblocked. + This will only happen in the case of input error. + */ diff --git a/sps.cc b/sps.cc new file mode 100644 index 0000000..476cdbb --- /dev/null +++ b/sps.cc @@ -0,0 +1,1298 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see <http://www.gnu.org/licenses/>.
+ */ + +#include "sps.h" +#include "util.h" +#include "scan.h" +#include "decctx.h" + +#include +#include +#include + +#define READ_VLC_OFFSET(variable, vlctype, offset) \ + if ((vlc = get_ ## vlctype(br)) == UVLC_ERROR) { \ + errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); \ + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; \ + } \ + variable = vlc + offset; + +#define READ_VLC(variable, vlctype) READ_VLC_OFFSET(variable,vlctype,0) + + +static int SubWidthC_tab[] = { 1,2,2,1 }; +static int SubHeightC_tab[] = { 1,2,1,1 }; + + +// TODO if (!check_high(ctx, vlc, 15)) return false; +// TODO if (!check_ulvc(ctx, vlc)) return false; + + +// TODO: should be in some header-file of refpic.c +extern bool read_short_term_ref_pic_set(error_queue* errqueue, + const seq_parameter_set* sps, + bitreader* br, + ref_pic_set* out_set, + int idxRps, // index of the set to be read + const std::vector& sets, + bool sliceRefPicSet); + +extern bool write_short_term_ref_pic_set(error_queue* errqueue, + const seq_parameter_set* sps, + CABAC_encoder& out, + const ref_pic_set* in_set, // which set to write + int idxRps, // index of the set to be read + const std::vector& sets, // previously read sets + bool sliceRefPicSet); // is this in the slice header? 
+ + +sps_range_extension::sps_range_extension() +{ + transform_skip_rotation_enabled_flag = 0; + transform_skip_context_enabled_flag = 0; + implicit_rdpcm_enabled_flag = 0; + explicit_rdpcm_enabled_flag = 0; + extended_precision_processing_flag = 0; + intra_smoothing_disabled_flag = 0; + high_precision_offsets_enabled_flag = 0; + persistent_rice_adaptation_enabled_flag = 0; + cabac_bypass_alignment_enabled_flag = 0; +} + + +seq_parameter_set::seq_parameter_set() +{ + // TODO: this is dangerous + //memset(this,0,sizeof(seq_parameter_set)); + + sps_read = false; + //ref_pic_sets = NULL; +} + + +seq_parameter_set::~seq_parameter_set() +{ + //free(ref_pic_sets); +} + + +void seq_parameter_set::set_defaults(enum PresetSet) +{ + video_parameter_set_id = 0; + sps_max_sub_layers = 1; + sps_temporal_id_nesting_flag = 1; + + profile_tier_level_.general.set_defaults(Profile_Main, 6,2); // TODO + + seq_parameter_set_id = 0; + chroma_format_idc = 1; + ChromaArrayType = chroma_format_idc; + + separate_colour_plane_flag = 0; + pic_width_in_luma_samples = 0; + pic_height_in_luma_samples = 0; + conformance_window_flag = 0; + + conf_win_left_offset = 0; + conf_win_right_offset = 0; + conf_win_top_offset = 0; + conf_win_bottom_offset = 0; + + bit_depth_luma =8; + bit_depth_chroma=8; + + log2_max_pic_order_cnt_lsb = 8; + sps_sub_layer_ordering_info_present_flag = 0; + + sps_max_dec_pic_buffering[0] = 1; + sps_max_num_reorder_pics[0] = 0; + sps_max_latency_increase_plus1[0] = 0; + + set_CB_log2size_range(4,4); + set_TB_log2size_range(3,4); + max_transform_hierarchy_depth_inter = 1; + max_transform_hierarchy_depth_intra = 1; + + scaling_list_enable_flag = 0; + sps_scaling_list_data_present_flag = 0; + + // TODO struct scaling_list_data scaling_list; + + amp_enabled_flag = 0; + sample_adaptive_offset_enabled_flag = 0; + pcm_enabled_flag = 0; + + pcm_sample_bit_depth_luma = 8; + pcm_sample_bit_depth_chroma = 8; + // TODO log2_min_pcm_luma_coding_block_size; + // TODO 
log2_diff_max_min_pcm_luma_coding_block_size; + pcm_loop_filter_disable_flag = 1; + + // num_short_term_ref_pic_sets = 0; + // std::vector ref_pic_sets; // [0 ; num_short_term_ref_pic_set (<=MAX_REF_PIC_SETS) ) + ref_pic_sets.clear(); + + long_term_ref_pics_present_flag = 0; + + num_long_term_ref_pics_sps = 0; + + /* TODO + int lt_ref_pic_poc_lsb_sps[MAX_NUM_LT_REF_PICS_SPS]; + char used_by_curr_pic_lt_sps_flag[MAX_NUM_LT_REF_PICS_SPS]; + */ + + sps_temporal_mvp_enabled_flag = 0; + strong_intra_smoothing_enable_flag = 0; + vui_parameters_present_flag = 0; + + /* + if( vui_parameters_present_flag ) + vui_parameters() + */ + + sps_extension_present_flag = 0; + sps_range_extension_flag = 0; + sps_multilayer_extension_flag = 0; + sps_extension_6bits = 0; +} + + +void seq_parameter_set::set_CB_log2size_range(int mini,int maxi) +{ + log2_min_luma_coding_block_size = mini; + log2_diff_max_min_luma_coding_block_size = maxi-mini; +} + + +void seq_parameter_set::set_TB_log2size_range(int mini,int maxi) +{ + log2_min_transform_block_size = mini; + log2_diff_max_min_transform_block_size = maxi-mini; +} + + +void seq_parameter_set::set_resolution(int w,int h) +{ + pic_width_in_luma_samples = w; + pic_height_in_luma_samples = h; +} + + +de265_error seq_parameter_set::read(error_queue* errqueue, bitreader* br) +{ + int vlc; + + video_parameter_set_id = get_bits(br,4); + sps_max_sub_layers = get_bits(br,3) +1; + if (sps_max_sub_layers>7) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + sps_temporal_id_nesting_flag = get_bits(br,1); + + profile_tier_level_.read(br, sps_max_sub_layers); + + READ_VLC(seq_parameter_set_id, uvlc); + if (seq_parameter_set_id >= DE265_MAX_SPS_SETS) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + + // --- decode chroma type --- + + READ_VLC(chroma_format_idc, uvlc); + + if (chroma_format_idc == 3) { + separate_colour_plane_flag = get_bits(br,1); + } + else { + separate_colour_plane_flag = 0; + } + + if (chroma_format_idc<0 || + 
chroma_format_idc>3) { + errqueue->add_warning(DE265_WARNING_INVALID_CHROMA_FORMAT, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + + // --- picture size --- + + READ_VLC(pic_width_in_luma_samples, uvlc); + READ_VLC(pic_height_in_luma_samples, uvlc); + + if (pic_width_in_luma_samples == 0 || + pic_height_in_luma_samples == 0) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + if (pic_width_in_luma_samples > MAX_PICTURE_WIDTH || + pic_height_in_luma_samples> MAX_PICTURE_HEIGHT) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + conformance_window_flag = get_bits(br,1); + + if (conformance_window_flag) { + READ_VLC(conf_win_left_offset, uvlc); + READ_VLC(conf_win_right_offset, uvlc); + READ_VLC(conf_win_top_offset, uvlc); + READ_VLC(conf_win_bottom_offset,uvlc); + } + else { + conf_win_left_offset = 0; + conf_win_right_offset = 0; + conf_win_top_offset = 0; + conf_win_bottom_offset= 0; + } + + READ_VLC_OFFSET(bit_depth_luma, uvlc, 8); + READ_VLC_OFFSET(bit_depth_chroma,uvlc, 8); + if (bit_depth_luma > 16 || + bit_depth_chroma > 16) { + errqueue->add_warning(DE265_WARNING_SPS_HEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + READ_VLC_OFFSET(log2_max_pic_order_cnt_lsb, uvlc, 4); + if (log2_max_pic_order_cnt_lsb<4 || + log2_max_pic_order_cnt_lsb>16) { + errqueue->add_warning(DE265_WARNING_SPS_HEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + MaxPicOrderCntLsb = 1<<(log2_max_pic_order_cnt_lsb); + + + // --- sub_layer_ordering_info --- + + sps_sub_layer_ordering_info_present_flag = get_bits(br,1); + + int firstLayer = (sps_sub_layer_ordering_info_present_flag ? 
+ 0 : sps_max_sub_layers-1 ); + + for (int i=firstLayer ; i <= sps_max_sub_layers-1; i++ ) { + + // sps_max_dec_pic_buffering[i] + + vlc=get_uvlc(br); + if (vlc == UVLC_ERROR || + vlc+1 > MAX_NUM_REF_PICS) { + errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + sps_max_dec_pic_buffering[i] = vlc+1; + + // sps_max_num_reorder_pics[i] + + READ_VLC(sps_max_num_reorder_pics[i], uvlc); + + + // sps_max_latency_increase[i] + + READ_VLC(sps_max_latency_increase_plus1[i], uvlc); + + SpsMaxLatencyPictures[i] = (sps_max_num_reorder_pics[i] + + sps_max_latency_increase_plus1[i]-1); + } + + // copy info to all layers if only specified once + + if (sps_sub_layer_ordering_info_present_flag) { + int ref = sps_max_sub_layers-1; + assert(ref<7); + + for (int i=0 ; i < sps_max_sub_layers-1; i++ ) { + sps_max_dec_pic_buffering[i] = sps_max_dec_pic_buffering[ref]; + sps_max_num_reorder_pics[i] = sps_max_num_reorder_pics[ref]; + sps_max_latency_increase_plus1[i] = sps_max_latency_increase_plus1[ref]; + } + } + + + READ_VLC_OFFSET(log2_min_luma_coding_block_size, uvlc, 3); + READ_VLC (log2_diff_max_min_luma_coding_block_size, uvlc); + READ_VLC_OFFSET(log2_min_transform_block_size, uvlc, 2); + READ_VLC(log2_diff_max_min_transform_block_size, uvlc); + READ_VLC(max_transform_hierarchy_depth_inter, uvlc); + READ_VLC(max_transform_hierarchy_depth_intra, uvlc); + + if (log2_min_luma_coding_block_size > 6) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } + if (log2_min_luma_coding_block_size + log2_diff_max_min_luma_coding_block_size > 6) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } + if (log2_min_transform_block_size > 5) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } + if (log2_min_transform_block_size + log2_diff_max_min_transform_block_size > 5) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } + + scaling_list_enable_flag = get_bits(br,1); + + if (scaling_list_enable_flag) { + + 
sps_scaling_list_data_present_flag = get_bits(br,1); + if (sps_scaling_list_data_present_flag) { + + de265_error err; + if ((err=read_scaling_list(br,this, &scaling_list, false)) != DE265_OK) { + return err; + } + } + else { + set_default_scaling_lists(&scaling_list); + } + } + + amp_enabled_flag = get_bits(br,1); + sample_adaptive_offset_enabled_flag = get_bits(br,1); + pcm_enabled_flag = get_bits(br,1); + if (pcm_enabled_flag) { + pcm_sample_bit_depth_luma = get_bits(br,4)+1; + pcm_sample_bit_depth_chroma = get_bits(br,4)+1; + READ_VLC_OFFSET(log2_min_pcm_luma_coding_block_size, uvlc, 3); + READ_VLC(log2_diff_max_min_pcm_luma_coding_block_size, uvlc); + pcm_loop_filter_disable_flag = get_bits(br,1); + } + else { + pcm_sample_bit_depth_luma = 0; + pcm_sample_bit_depth_chroma = 0; + log2_min_pcm_luma_coding_block_size = 0; + log2_diff_max_min_pcm_luma_coding_block_size = 0; + pcm_loop_filter_disable_flag = 0; + } + + int num_short_term_ref_pic_sets; + READ_VLC(num_short_term_ref_pic_sets, uvlc); + if (num_short_term_ref_pic_sets < 0 || + num_short_term_ref_pic_sets > 64) { + errqueue->add_warning(DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + // --- allocate reference pic set --- + + // we do not allocate the ref-pic-set for the slice header here, but in the slice header itself + + ref_pic_sets.resize(num_short_term_ref_pic_sets); + + for (int i = 0; i < num_short_term_ref_pic_sets; i++) { + + bool success = read_short_term_ref_pic_set(errqueue,this,br, + &ref_pic_sets[i], i, + ref_pic_sets, + false); + + if (!success) { + return DE265_WARNING_SPS_HEADER_INVALID; + } + + // dump_short_term_ref_pic_set(&(*ref_pic_sets)[i], fh); + } + + long_term_ref_pics_present_flag = get_bits(br,1); + + if (long_term_ref_pics_present_flag) { + + READ_VLC(num_long_term_ref_pics_sps, uvlc); + if (num_long_term_ref_pics_sps > MAX_NUM_LT_REF_PICS_SPS) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; 
+ } + + for (int i = 0; i < num_long_term_ref_pics_sps; i++ ) { + lt_ref_pic_poc_lsb_sps[i] = get_bits(br, log2_max_pic_order_cnt_lsb); + used_by_curr_pic_lt_sps_flag[i] = get_bits(br,1); + } + } + else { + num_long_term_ref_pics_sps = 0; // NOTE: missing definition in standard ! + } + + sps_temporal_mvp_enabled_flag = get_bits(br,1); + strong_intra_smoothing_enable_flag = get_bits(br,1); + + vui_parameters_present_flag = get_bits(br,1); + if (vui_parameters_present_flag) { + vui.read(errqueue, br, this); + } + + + sps_extension_present_flag = get_bits(br,1); + if (sps_extension_present_flag) { + sps_range_extension_flag = get_bits(br,1); + sps_multilayer_extension_flag = get_bits(br,1); + sps_extension_6bits = get_bits(br,6); + } + else { + sps_range_extension_flag = 0; + } + + if (sps_range_extension_flag) { + de265_error err = range_extension.read(errqueue, br); + if (err != DE265_OK) { return err; } + } + + /* + sps_extension_flag = get_bits(br,1); + if (sps_extension_flag) { + assert(false); + } + */ + + + de265_error err = compute_derived_values(); + if (err != DE265_OK) { return err; } + + sps_read = true; + + return DE265_OK; +} + + +de265_error seq_parameter_set::compute_derived_values(bool sanitize_values) +{ + // --- compute derived values --- + + SubWidthC = SubWidthC_tab [chroma_format_idc]; + SubHeightC = SubHeightC_tab[chroma_format_idc]; + + if (separate_colour_plane_flag) { + ChromaArrayType = 0; + } + else { + ChromaArrayType = chroma_format_idc; + } + + if (ChromaArrayType==0) { + WinUnitX = 1; + WinUnitY = 1; + } + else { + WinUnitX = SubWidthC_tab [chroma_format_idc]; + WinUnitY = SubHeightC_tab[chroma_format_idc]; + } + + + + BitDepth_Y = bit_depth_luma; + QpBdOffset_Y = 6*(bit_depth_luma-8); + BitDepth_C = bit_depth_chroma; + QpBdOffset_C = 6*(bit_depth_chroma-8); + + Log2MinCbSizeY = log2_min_luma_coding_block_size; + Log2CtbSizeY = Log2MinCbSizeY + log2_diff_max_min_luma_coding_block_size; + MinCbSizeY = 1 << Log2MinCbSizeY; + CtbSizeY = 1 
<< Log2CtbSizeY; + + PicWidthInMinCbsY = ceil_div(pic_width_in_luma_samples, MinCbSizeY); + PicWidthInCtbsY = ceil_div(pic_width_in_luma_samples, CtbSizeY); + PicHeightInMinCbsY = ceil_div(pic_height_in_luma_samples, MinCbSizeY); + PicHeightInCtbsY = ceil_div(pic_height_in_luma_samples,CtbSizeY); + PicSizeInMinCbsY = PicWidthInMinCbsY * PicHeightInMinCbsY; + PicSizeInCtbsY = PicWidthInCtbsY * PicHeightInCtbsY; + PicSizeInSamplesY = pic_width_in_luma_samples * pic_height_in_luma_samples; + + if (chroma_format_idc==0 || separate_colour_plane_flag) { + CtbWidthC = 0; + CtbHeightC = 0; + } + else { + CtbWidthC = CtbSizeY / SubWidthC; + CtbHeightC = CtbSizeY / SubHeightC; + } + + Log2MinTrafoSize = log2_min_transform_block_size; + Log2MaxTrafoSize = log2_min_transform_block_size + log2_diff_max_min_transform_block_size; + + if (max_transform_hierarchy_depth_inter > Log2CtbSizeY - Log2MinTrafoSize) { + if (sanitize_values) { + max_transform_hierarchy_depth_inter = Log2CtbSizeY - Log2MinTrafoSize; + } else { + fprintf(stderr,"SPS error: transform hierarchy depth (inter) > CTB size - min TB size\n"); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + + if (max_transform_hierarchy_depth_intra > Log2CtbSizeY - Log2MinTrafoSize) { + if (sanitize_values) { + max_transform_hierarchy_depth_intra = Log2CtbSizeY - Log2MinTrafoSize; + } else { + fprintf(stderr,"SPS error: transform hierarchy depth (intra) > CTB size - min TB size\n"); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + + + if (sanitize_values) { + if (max_transform_hierarchy_depth_inter < Log2CtbSizeY - Log2MaxTrafoSize) { + max_transform_hierarchy_depth_inter = Log2CtbSizeY - Log2MaxTrafoSize; + } + + if (max_transform_hierarchy_depth_intra < Log2CtbSizeY - Log2MaxTrafoSize) { + max_transform_hierarchy_depth_intra = Log2CtbSizeY - Log2MaxTrafoSize; + } + } + + + Log2MinPUSize = Log2MinCbSizeY-1; + PicWidthInMinPUs = PicWidthInCtbsY << (Log2CtbSizeY - Log2MinPUSize); + PicHeightInMinPUs = 
PicHeightInCtbsY << (Log2CtbSizeY - Log2MinPUSize); + + Log2MinIpcmCbSizeY = log2_min_pcm_luma_coding_block_size; + Log2MaxIpcmCbSizeY = (log2_min_pcm_luma_coding_block_size + + log2_diff_max_min_pcm_luma_coding_block_size); + + // the following are not in the standard + PicWidthInTbsY = PicWidthInCtbsY << (Log2CtbSizeY - Log2MinTrafoSize); + PicHeightInTbsY = PicHeightInCtbsY << (Log2CtbSizeY - Log2MinTrafoSize); + PicSizeInTbsY = PicWidthInTbsY * PicHeightInTbsY; + + + if (range_extension.high_precision_offsets_enabled_flag) { + WpOffsetBdShiftY = 0; + WpOffsetBdShiftC = 0; + WpOffsetHalfRangeY = 1 << (BitDepth_Y - 1); + WpOffsetHalfRangeC = 1 << (BitDepth_C - 1); + } + else { + WpOffsetBdShiftY = ( BitDepth_Y - 8 ); + WpOffsetBdShiftC = ( BitDepth_C - 8 ); + WpOffsetHalfRangeY = 1 << 7; + WpOffsetHalfRangeC = 1 << 7; + } + + + // --- check SPS sanity --- + + if (pic_width_in_luma_samples % MinCbSizeY != 0 || + pic_height_in_luma_samples % MinCbSizeY != 0) { + // TODO: warn that image size is coded wrong in bitstream (must be multiple of MinCbSizeY) + fprintf(stderr,"SPS error: CB alignment\n"); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + if (Log2MinTrafoSize > Log2MinCbSizeY) { + fprintf(stderr,"SPS error: TB > CB\n"); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + if (Log2MaxTrafoSize > libde265_min(Log2CtbSizeY,5)) { + fprintf(stderr,"SPS error: TB_max > 32 or CTB\n"); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + + if (BitDepth_Y < 8 || BitDepth_Y > 16) { + fprintf(stderr,"SPS error: bitdepth Y not in [8;16]\n"); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + if (BitDepth_C < 8 || BitDepth_C > 16) { + fprintf(stderr,"SPS error: bitdepth C not in [8;16]\n"); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + + sps_read = true; + + return DE265_OK; +} + + + +void seq_parameter_set::dump(int fd) const +{ + //#if (_MSC_VER >= 1500) + //#define LOG0(t) loginfo(LogHeaders, t) + //#define LOG1(t,d) 
loginfo(LogHeaders, t,d) + //#define LOG2(t,d1,d2) loginfo(LogHeaders, t,d1,d2) + //#define LOG3(t,d1,d2,d3) loginfo(LogHeaders, t,d1,d2,d3) + + FILE* fh; + if (fd==1) fh=stdout; + else if (fd==2) fh=stderr; + else { return; } + +#define LOG0(t) log2fh(fh, t) +#define LOG1(t,d) log2fh(fh, t,d) +#define LOG2(t,d1,d2) log2fh(fh, t,d1,d2) +#define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3) + + + LOG0("----------------- SPS -----------------\n"); + LOG1("video_parameter_set_id : %d\n", video_parameter_set_id); + LOG1("sps_max_sub_layers : %d\n", sps_max_sub_layers); + LOG1("sps_temporal_id_nesting_flag : %d\n", sps_temporal_id_nesting_flag); + + profile_tier_level_.dump(sps_max_sub_layers, fh); + + LOG1("seq_parameter_set_id : %d\n", seq_parameter_set_id); + LOG2("chroma_format_idc : %d (%s)\n", chroma_format_idc, + chroma_format_idc == 0 ? "monochrome" : + chroma_format_idc == 1 ? "4:2:0" : + chroma_format_idc == 2 ? "4:2:2" : + chroma_format_idc == 3 ? "4:4:4" : "unknown"); + + if (chroma_format_idc == 3) { + LOG1("separate_colour_plane_flag : %d\n", separate_colour_plane_flag); + } + + LOG1("pic_width_in_luma_samples : %d\n", pic_width_in_luma_samples); + LOG1("pic_height_in_luma_samples : %d\n", pic_height_in_luma_samples); + LOG1("conformance_window_flag : %d\n", conformance_window_flag); + + if (conformance_window_flag) { + LOG1("conf_win_left_offset : %d\n", conf_win_left_offset); + LOG1("conf_win_right_offset : %d\n", conf_win_right_offset); + LOG1("conf_win_top_offset : %d\n", conf_win_top_offset); + LOG1("conf_win_bottom_offset: %d\n", conf_win_bottom_offset); + } + + LOG1("bit_depth_luma : %d\n", bit_depth_luma); + LOG1("bit_depth_chroma : %d\n", bit_depth_chroma); + + LOG1("log2_max_pic_order_cnt_lsb : %d\n", log2_max_pic_order_cnt_lsb); + LOG1("sps_sub_layer_ordering_info_present_flag : %d\n", sps_sub_layer_ordering_info_present_flag); + + int firstLayer = (sps_sub_layer_ordering_info_present_flag ? 
+ 0 : sps_max_sub_layers-1 ); + + for (int i=firstLayer ; i <= sps_max_sub_layers-1; i++ ) { + LOG1("Layer %d\n",i); + LOG1(" sps_max_dec_pic_buffering : %d\n", sps_max_dec_pic_buffering[i]); + LOG1(" sps_max_num_reorder_pics : %d\n", sps_max_num_reorder_pics[i]); + LOG1(" sps_max_latency_increase_plus1 : %d\n", sps_max_latency_increase_plus1[i]); + } + + LOG1("log2_min_luma_coding_block_size : %d\n", log2_min_luma_coding_block_size); + LOG1("log2_diff_max_min_luma_coding_block_size : %d\n",log2_diff_max_min_luma_coding_block_size); + LOG1("log2_min_transform_block_size : %d\n", log2_min_transform_block_size); + LOG1("log2_diff_max_min_transform_block_size : %d\n", log2_diff_max_min_transform_block_size); + LOG1("max_transform_hierarchy_depth_inter : %d\n", max_transform_hierarchy_depth_inter); + LOG1("max_transform_hierarchy_depth_intra : %d\n", max_transform_hierarchy_depth_intra); + LOG1("scaling_list_enable_flag : %d\n", scaling_list_enable_flag); + + if (scaling_list_enable_flag) { + + LOG1("sps_scaling_list_data_present_flag : %d\n", sps_scaling_list_data_present_flag); + if (sps_scaling_list_data_present_flag) { + + LOG0("scaling list logging output not implemented"); + //assert(0); + //scaling_list_data() + } + } + + LOG1("amp_enabled_flag : %d\n", amp_enabled_flag); + LOG1("sample_adaptive_offset_enabled_flag : %d\n", sample_adaptive_offset_enabled_flag); + LOG1("pcm_enabled_flag : %d\n", pcm_enabled_flag); + + if (pcm_enabled_flag) { + LOG1("pcm_sample_bit_depth_luma : %d\n", pcm_sample_bit_depth_luma); + LOG1("pcm_sample_bit_depth_chroma : %d\n", pcm_sample_bit_depth_chroma); + LOG1("log2_min_pcm_luma_coding_block_size : %d\n", log2_min_pcm_luma_coding_block_size); + LOG1("log2_diff_max_min_pcm_luma_coding_block_size : %d\n", log2_diff_max_min_pcm_luma_coding_block_size); + LOG1("pcm_loop_filter_disable_flag : %d\n", pcm_loop_filter_disable_flag); + } + + LOG1("num_short_term_ref_pic_sets : %d\n", ref_pic_sets.size()); + + for (int i = 0; i < 
ref_pic_sets.size(); i++) { + LOG1("ref_pic_set[ %2d ]: ",i); + dump_compact_short_term_ref_pic_set(&ref_pic_sets[i], 16, fh); + } + + LOG1("long_term_ref_pics_present_flag : %d\n", long_term_ref_pics_present_flag); + + if (long_term_ref_pics_present_flag) { + + LOG1("num_long_term_ref_pics_sps : %d\n", num_long_term_ref_pics_sps); + + for (int i = 0; i < num_long_term_ref_pics_sps; i++ ) { + LOG3("lt_ref_pic_poc_lsb_sps[%d] : %d (used_by_curr_pic_lt_sps_flag=%d)\n", + i, lt_ref_pic_poc_lsb_sps[i], used_by_curr_pic_lt_sps_flag[i]); + } + } + + LOG1("sps_temporal_mvp_enabled_flag : %d\n", sps_temporal_mvp_enabled_flag); + LOG1("strong_intra_smoothing_enable_flag : %d\n", strong_intra_smoothing_enable_flag); + LOG1("vui_parameters_present_flag : %d\n", vui_parameters_present_flag); + + LOG1("sps_extension_present_flag : %d\n", sps_extension_present_flag); + LOG1("sps_range_extension_flag : %d\n", sps_range_extension_flag); + LOG1("sps_multilayer_extension_flag : %d\n", sps_multilayer_extension_flag); + LOG1("sps_extension_6bits : %d\n", sps_extension_6bits); + + LOG1("CtbSizeY : %d\n", CtbSizeY); + LOG1("MinCbSizeY : %d\n", MinCbSizeY); + LOG1("MaxCbSizeY : %d\n", 1<<(log2_min_luma_coding_block_size + log2_diff_max_min_luma_coding_block_size)); + LOG1("MinTBSizeY : %d\n", 1< matrixId) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + //printf("scaling_list_pred_matrix_id_delta=%d\n", scaling_list_pred_matrix_id_delta); + + dc_coeff[sizeId][matrixId] = 16; + scaling_list_dc_coef = 16; + + if (scaling_list_pred_matrix_id_delta==0) { + if (sizeId==0) { + memcpy(curr_scaling_list, default_ScalingList_4x4, 16); + } + else { + if (canonicalMatrixId<3) + { memcpy(curr_scaling_list, default_ScalingList_8x8_intra,64); } + else + { memcpy(curr_scaling_list, default_ScalingList_8x8_inter,64); } + } + } + else { + // TODO: CHECK: for sizeID=3 and the second matrix, should we have delta=1 or delta=3 ? 
+ if (sizeId==3) { assert(scaling_list_pred_matrix_id_delta==1); } + + int mID = matrixId - scaling_list_pred_matrix_id_delta; + + int len = (sizeId == 0 ? 16 : 64); + memcpy(curr_scaling_list, scaling_list[mID], len); + + scaling_list_dc_coef = dc_coeff[sizeId][mID]; + dc_coeff[sizeId][matrixId] = dc_coeff[sizeId][mID]; + } + } + else { + int nextCoef=8; + int coefNum = (sizeId==0 ? 16 : 64); + if (sizeId>1) { + scaling_list_dc_coef = get_svlc(br); + if (scaling_list_dc_coef < -7 || + scaling_list_dc_coef > 247) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + scaling_list_dc_coef += 8; + nextCoef=scaling_list_dc_coef; + dc_coeff[sizeId][matrixId] = scaling_list_dc_coef; + } + else { + scaling_list_dc_coef = 16; + } + //printf("DC = %d\n",scaling_list_dc_coef); + + for (int i=0;i 127) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + nextCoef = (nextCoef + scaling_list_delta_coef + 256) % 256; + curr_scaling_list[i] = nextCoef; + //printf("curr %d = %d\n",i,nextCoef); + } + } + + + // --- generate ScalingFactor arrays --- + + switch (sizeId) { + case 0: + fill_scaling_factor(&sclist->ScalingFactor_Size0[matrixId][0][0], curr_scaling_list, 0); + break; + + case 1: + fill_scaling_factor(&sclist->ScalingFactor_Size1[matrixId][0][0], curr_scaling_list, 1); + break; + + case 2: + fill_scaling_factor(&sclist->ScalingFactor_Size2[matrixId][0][0], curr_scaling_list, 2); + sclist->ScalingFactor_Size2[matrixId][0][0] = scaling_list_dc_coef; + //printf("DC coeff: %d\n", scaling_list_dc_coef); + break; + + case 3: + fill_scaling_factor(&sclist->ScalingFactor_Size3[matrixId][0][0], curr_scaling_list, 3); + sclist->ScalingFactor_Size3[matrixId][0][0] = scaling_list_dc_coef; + //printf("DC coeff: %d\n", scaling_list_dc_coef); + break; + } + } + } + + return DE265_OK; +} + + +de265_error write_scaling_list(CABAC_encoder& out, const seq_parameter_set* sps, + scaling_list_data* sclist, bool inPPS) +{ + assert(false); + // TODO + + return DE265_OK; +} + + 
+void set_default_scaling_lists(scaling_list_data* sclist) +{ + // 4x4 + + for (int matrixId=0;matrixId<6;matrixId++) { + fill_scaling_factor(&sclist->ScalingFactor_Size0[matrixId][0][0], + default_ScalingList_4x4, 0); + } + + // 8x8 + + for (int matrixId=0;matrixId<3;matrixId++) { + fill_scaling_factor(&sclist->ScalingFactor_Size1[matrixId+0][0][0], + default_ScalingList_8x8_intra, 1); + fill_scaling_factor(&sclist->ScalingFactor_Size1[matrixId+3][0][0], + default_ScalingList_8x8_inter, 1); + } + + // 16x16 + + for (int matrixId=0;matrixId<3;matrixId++) { + fill_scaling_factor(&sclist->ScalingFactor_Size2[matrixId+0][0][0], + default_ScalingList_8x8_intra, 2); + fill_scaling_factor(&sclist->ScalingFactor_Size2[matrixId+3][0][0], + default_ScalingList_8x8_inter, 2); + } + + // 32x32 + + fill_scaling_factor(&sclist->ScalingFactor_Size3[0][0][0], + default_ScalingList_8x8_intra, 3); + fill_scaling_factor(&sclist->ScalingFactor_Size3[1][0][0], + default_ScalingList_8x8_inter, 3); +} + + +de265_error seq_parameter_set::write(error_queue* errqueue, CABAC_encoder& out) +{ + out.write_bits(video_parameter_set_id, 4); + if (sps_max_sub_layers>7) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + out.write_bits(sps_max_sub_layers-1, 3); + + out.write_bit(sps_temporal_id_nesting_flag); + + profile_tier_level_.write(out, sps_max_sub_layers); + + out.write_uvlc(seq_parameter_set_id); + + + // --- encode chroma type --- + + out.write_uvlc(chroma_format_idc); + + if (chroma_format_idc<0 || + chroma_format_idc>3) { + errqueue->add_warning(DE265_WARNING_INVALID_CHROMA_FORMAT, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + if (chroma_format_idc == 3) { + out.write_bit(separate_colour_plane_flag); + } + + + // --- picture size --- + + out.write_uvlc(pic_width_in_luma_samples); + out.write_uvlc(pic_height_in_luma_samples); + + out.write_bit(conformance_window_flag); + + if (conformance_window_flag) { + out.write_uvlc(conf_win_left_offset); + 
out.write_uvlc(conf_win_right_offset); + out.write_uvlc(conf_win_top_offset); + out.write_uvlc(conf_win_bottom_offset); + } + + + out.write_uvlc(bit_depth_luma-8); + out.write_uvlc(bit_depth_chroma-8); + + out.write_uvlc(log2_max_pic_order_cnt_lsb-4); + + + // --- sub_layer_ordering_info --- + + out.write_bit(sps_sub_layer_ordering_info_present_flag); + + int firstLayer = (sps_sub_layer_ordering_info_present_flag ? + 0 : sps_max_sub_layers-1 ); + + for (int i=firstLayer ; i <= sps_max_sub_layers-1; i++ ) { + + // sps_max_dec_pic_buffering[i] + + if (sps_max_dec_pic_buffering[i] > MAX_NUM_REF_PICS) { + errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + out.write_uvlc(sps_max_dec_pic_buffering[i]-1); + + // sps_max_num_reorder_pics[i] + + out.write_uvlc(sps_max_num_reorder_pics[i]); + + + // sps_max_latency_increase[i] + + out.write_uvlc(sps_max_latency_increase_plus1[i]); + } + + + out.write_uvlc(log2_min_luma_coding_block_size-3); + out.write_uvlc(log2_diff_max_min_luma_coding_block_size); + out.write_uvlc(log2_min_transform_block_size-2); + out.write_uvlc(log2_diff_max_min_transform_block_size); + out.write_uvlc(max_transform_hierarchy_depth_inter); + out.write_uvlc(max_transform_hierarchy_depth_intra); + out.write_bit(scaling_list_enable_flag); + + if (scaling_list_enable_flag) { + + out.write_bit(sps_scaling_list_data_present_flag); + if (sps_scaling_list_data_present_flag) { + + de265_error err; + if ((err=write_scaling_list(out,this, &scaling_list, false)) != DE265_OK) { + return err; + } + } + } + + out.write_bit(amp_enabled_flag); + out.write_bit(sample_adaptive_offset_enabled_flag); + out.write_bit(pcm_enabled_flag); + if (pcm_enabled_flag) { + out.write_bits(pcm_sample_bit_depth_luma -1,4); + out.write_bits(pcm_sample_bit_depth_chroma-1,4); + out.write_uvlc(log2_min_pcm_luma_coding_block_size-3); + out.write_uvlc(log2_diff_max_min_pcm_luma_coding_block_size); + 
out.write_bit(pcm_loop_filter_disable_flag); + } + + int num_short_term_ref_pic_sets = ref_pic_sets.size(); + if (num_short_term_ref_pic_sets < 0 || + num_short_term_ref_pic_sets > 64) { + errqueue->add_warning(DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + out.write_uvlc(num_short_term_ref_pic_sets); + + // --- allocate reference pic set --- + + // we do not allocate the ref-pic-set for the slice header here, but in the slice header itself + + for (int i = 0; i < num_short_term_ref_pic_sets; i++) { + + bool success = write_short_term_ref_pic_set(errqueue,this,out, + &ref_pic_sets[i], i, + ref_pic_sets, + false); + + if (!success) { + return DE265_WARNING_SPS_HEADER_INVALID; + } + + // dump_short_term_ref_pic_set(&(*ref_pic_sets)[i], fh); + } + + out.write_bit(long_term_ref_pics_present_flag); + + if (long_term_ref_pics_present_flag) { + + if (num_long_term_ref_pics_sps > MAX_NUM_LT_REF_PICS_SPS) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + out.write_uvlc(num_long_term_ref_pics_sps); + + for (int i = 0; i < num_long_term_ref_pics_sps; i++ ) { + out.write_bits(lt_ref_pic_poc_lsb_sps[i], log2_max_pic_order_cnt_lsb); + out.write_bit (used_by_curr_pic_lt_sps_flag[i]); + } + } + + out.write_bit(sps_temporal_mvp_enabled_flag); + out.write_bit(strong_intra_smoothing_enable_flag); + out.write_bit(vui_parameters_present_flag); + +#if 0 + if (vui_parameters_present_flag) { + assert(false); + /* + vui_parameters() + sps_extension_flag + u(1) + if( sps_extension_flag ) + while( more_rbsp_data() ) + sps_extension_data_flag + u(1) + rbsp_trailing_bits() + */ + } +#endif + + out.write_bit(sps_extension_present_flag); + +#if 0 + if (sps_extension_flag) { + assert(false); + } + check_rbsp_trailing_bits(br); +#endif + + // --- compute derived values --- + +#if 0 + BitDepth_Y = bit_depth_luma; + QpBdOffset_Y = 6*(bit_depth_luma-8); + BitDepth_C = bit_depth_chroma; + QpBdOffset_C = 
6*(bit_depth_chroma-8); + Log2MinCbSizeY = log2_min_luma_coding_block_size; + Log2CtbSizeY = Log2MinCbSizeY + log2_diff_max_min_luma_coding_block_size; + MinCbSizeY = 1 << Log2MinCbSizeY; + CtbSizeY = 1 << Log2CtbSizeY; + PicWidthInMinCbsY = pic_width_in_luma_samples / MinCbSizeY; + PicWidthInCtbsY = ceil_div(pic_width_in_luma_samples, CtbSizeY); + PicHeightInMinCbsY = pic_height_in_luma_samples / MinCbSizeY; + PicHeightInCtbsY = ceil_div(pic_height_in_luma_samples,CtbSizeY); + PicSizeInMinCbsY = PicWidthInMinCbsY * PicHeightInMinCbsY; + PicSizeInCtbsY = PicWidthInCtbsY * PicHeightInCtbsY; + PicSizeInSamplesY = pic_width_in_luma_samples * pic_height_in_luma_samples; + if (chroma_format_idc==0 || separate_colour_plane_flag) { + CtbWidthC = 0; + CtbHeightC = 0; + } + else { + CtbWidthC = CtbSizeY / SubWidthC; + CtbHeightC = CtbSizeY / SubHeightC; + } + Log2MinTrafoSize = log2_min_transform_block_size; + Log2MaxTrafoSize = log2_min_transform_block_size + log2_diff_max_min_transform_block_size; + Log2MinPUSize = Log2MinCbSizeY-1; + PicWidthInMinPUs = PicWidthInCtbsY << (Log2CtbSizeY - Log2MinPUSize); + PicHeightInMinPUs = PicHeightInCtbsY << (Log2CtbSizeY - Log2MinPUSize); + Log2MinIpcmCbSizeY = log2_min_pcm_luma_coding_block_size; + Log2MaxIpcmCbSizeY = (log2_min_pcm_luma_coding_block_size + + log2_diff_max_min_pcm_luma_coding_block_size); + // the following are not in the standard + PicWidthInTbsY = PicWidthInCtbsY << (Log2CtbSizeY - Log2MinTrafoSize); + PicHeightInTbsY = PicHeightInCtbsY << (Log2CtbSizeY - Log2MinTrafoSize); + PicSizeInTbsY = PicWidthInTbsY * PicHeightInTbsY; + sps_read = true; +#endif + + return DE265_OK; +} + + +de265_error sps_range_extension::read(error_queue* errqueue, bitreader* br) +{ + transform_skip_rotation_enabled_flag = get_bits(br,1); + transform_skip_context_enabled_flag = get_bits(br,1); + implicit_rdpcm_enabled_flag = get_bits(br,1); + explicit_rdpcm_enabled_flag = get_bits(br,1); + extended_precision_processing_flag = 
get_bits(br,1); + intra_smoothing_disabled_flag = get_bits(br,1); + high_precision_offsets_enabled_flag = get_bits(br,1); + persistent_rice_adaptation_enabled_flag = get_bits(br,1); + cabac_bypass_alignment_enabled_flag = get_bits(br,1); + + return DE265_OK; +} + + +#define LOG0(t) log2fh(fh, t) +#define LOG1(t,d) log2fh(fh, t,d) +void sps_range_extension::dump(int fd) const +{ + FILE* fh; + if (fd==1) fh=stdout; + else if (fd==2) fh=stderr; + else { return; } + + LOG0("----------------- SPS-range-extension -----------------\n"); + LOG1("transform_skip_rotation_enabled_flag : %d\n", transform_skip_rotation_enabled_flag); + LOG1("transform_skip_context_enabled_flag : %d\n", transform_skip_context_enabled_flag); + LOG1("implicit_rdpcm_enabled_flag : %d\n", implicit_rdpcm_enabled_flag); + LOG1("explicit_rdpcm_enabled_flag : %d\n", explicit_rdpcm_enabled_flag); + LOG1("extended_precision_processing_flag : %d\n", extended_precision_processing_flag); + LOG1("intra_smoothing_disabled_flag : %d\n", intra_smoothing_disabled_flag); + LOG1("high_precision_offsets_enabled_flag : %d\n", high_precision_offsets_enabled_flag); + LOG1("persistent_rice_adaptation_enabled_flag : %d\n", persistent_rice_adaptation_enabled_flag); + LOG1("cabac_bypass_alignment_enabled_flag : %d\n", cabac_bypass_alignment_enabled_flag); +} +#undef LOG1 +#undef LOG0 diff --git a/threads.cc b/threads.cc new file mode 100644 index 0000000..d21193d --- /dev/null +++ b/threads.cc @@ -0,0 +1,312 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "threads.h" +#include +#include + +#if defined(_MSC_VER) || defined(__MINGW32__) +# include +#elif defined(HAVE_ALLOCA_H) +# include +#endif + + +#ifndef _WIN32 +// #include + +#define THREAD_RESULT void* +#define THREAD_PARAM void* + +#include + +int de265_thread_create(de265_thread* t, void *(*start_routine) (void *), void *arg) { return pthread_create(t,NULL,start_routine,arg); } +void de265_thread_join(de265_thread t) { pthread_join(t,NULL); } +void de265_thread_destroy(de265_thread* t) { } +void de265_mutex_init(de265_mutex* m) { pthread_mutex_init(m,NULL); } +void de265_mutex_destroy(de265_mutex* m) { pthread_mutex_destroy(m); } +void de265_mutex_lock(de265_mutex* m) { pthread_mutex_lock(m); } +void de265_mutex_unlock(de265_mutex* m) { pthread_mutex_unlock(m); } +void de265_cond_init(de265_cond* c) { pthread_cond_init(c,NULL); } +void de265_cond_destroy(de265_cond* c) { pthread_cond_destroy(c); } +void de265_cond_broadcast(de265_cond* c,de265_mutex* m) { pthread_cond_broadcast(c); } +void de265_cond_wait(de265_cond* c,de265_mutex* m) { pthread_cond_wait(c,m); } +void de265_cond_signal(de265_cond* c) { pthread_cond_signal(c); } +#else // _WIN32 + +#define THREAD_RESULT DWORD WINAPI +#define THREAD_PARAM LPVOID + +int de265_thread_create(de265_thread* t, LPTHREAD_START_ROUTINE start_routine, void *arg) { + HANDLE handle = CreateThread(NULL, 0, start_routine, arg, 0, NULL); + if (handle == NULL) { + return -1; + } + *t = handle; + return 0; +} +void de265_thread_join(de265_thread t) { WaitForSingleObject(t, INFINITE); } +void de265_thread_destroy(de265_thread* t) { 
CloseHandle(*t); *t = NULL; } +void de265_mutex_init(de265_mutex* m) { *m = CreateMutex(NULL, FALSE, NULL); } +void de265_mutex_destroy(de265_mutex* m) { CloseHandle(*m); } +void de265_mutex_lock(de265_mutex* m) { WaitForSingleObject(*m, INFINITE); } +void de265_mutex_unlock(de265_mutex* m) { ReleaseMutex(*m); } +void de265_cond_init(de265_cond* c) { win32_cond_init(c); } +void de265_cond_destroy(de265_cond* c) { win32_cond_destroy(c); } +void de265_cond_broadcast(de265_cond* c,de265_mutex* m) +{ + de265_mutex_lock(m); + win32_cond_broadcast(c); + de265_mutex_unlock(m); +} +void de265_cond_wait(de265_cond* c,de265_mutex* m) { win32_cond_wait(c,m); } +void de265_cond_signal(de265_cond* c) { win32_cond_signal(c); } +#endif // _WIN32 + + + + +de265_progress_lock::de265_progress_lock() +{ + mProgress = 0; + + de265_mutex_init(&mutex); + de265_cond_init(&cond); +} + +de265_progress_lock::~de265_progress_lock() +{ + de265_mutex_destroy(&mutex); + de265_cond_destroy(&cond); +} + +void de265_progress_lock::wait_for_progress(int progress) +{ + if (mProgress >= progress) { + return; + } + + de265_mutex_lock(&mutex); + while (mProgress < progress) { + de265_cond_wait(&cond, &mutex); + } + de265_mutex_unlock(&mutex); +} + +void de265_progress_lock::set_progress(int progress) +{ + de265_mutex_lock(&mutex); + + if (progress>mProgress) { + mProgress = progress; + + de265_cond_broadcast(&cond, &mutex); + } + + de265_mutex_unlock(&mutex); +} + +void de265_progress_lock::increase_progress(int progress) +{ + de265_mutex_lock(&mutex); + + mProgress += progress; + de265_cond_broadcast(&cond, &mutex); + + de265_mutex_unlock(&mutex); +} + +int de265_progress_lock::get_progress() const +{ + return mProgress; +} + + + + +#include "libde265/decctx.h" + +#if 0 +const char* line="--------------------------------------------------"; +void printblks(const thread_pool* pool) +{ + int w = pool->tasks[0].data.task_ctb.ctx->current_sps->PicWidthInCtbsY; + int h = 
pool->tasks[0].data.task_ctb.ctx->current_sps->PicHeightInCtbsY; + + printf("active threads: %d queue len: %d\n",pool->num_threads_working,pool->num_tasks); + + char *const p = (char *)alloca(w * h * sizeof(char)); + assert(p != NULL); + memset(p,' ',w*h); + + for (int i=0;inum_tasks;i++) { + int b = 0; //pool->tasks[i].num_blockers; + int x = pool->tasks[i].data.task_ctb.ctb_x; + int y = pool->tasks[i].data.task_ctb.ctb_y; + p[y*w+x] = b+'0'; + } + + for (int i=0;inum_threads_working;i++) { + int x = pool->ctbx[i]; + int y = pool->ctby[i]; + p[y*w+x] = '*'; + } + + printf("+%s+\n",line+50-w); + for (int y=0;ymutex); + + while(true) { + + // wait until we can pick a task or until the pool has been stopped + + for (;;) { + // end waiting if thread-pool has been stopped or we have a task to execute + + if (pool->stopped || pool->tasks.size()>0) { + break; + } + + //printf("going idle\n"); + de265_cond_wait(&pool->cond_var, &pool->mutex); + } + + // if the pool was shut down, end the execution + + if (pool->stopped) { + de265_mutex_unlock(&pool->mutex); + return NULL; + } + + + // get a task + + thread_task* task = pool->tasks.front(); + pool->tasks.pop_front(); + + pool->num_threads_working++; + + //printblks(pool); + + de265_mutex_unlock(&pool->mutex); + + + // execute the task + + task->work(); + + // end processing and check if this was the last task to be processed + + de265_mutex_lock(&pool->mutex); + + pool->num_threads_working--; + } + de265_mutex_unlock(&pool->mutex); + + return NULL; +} + + +de265_error start_thread_pool(thread_pool* pool, int num_threads) +{ + de265_error err = DE265_OK; + + // limit number of threads to maximum + + if (num_threads > MAX_THREADS) { + num_threads = MAX_THREADS; + err = DE265_WARNING_NUMBER_OF_THREADS_LIMITED_TO_MAXIMUM; + } + + pool->num_threads = 0; // will be increased below + + de265_mutex_init(&pool->mutex); + de265_cond_init(&pool->cond_var); + + de265_mutex_lock(&pool->mutex); + pool->num_threads_working = 0; + 
pool->stopped = false; + de265_mutex_unlock(&pool->mutex); + + // start worker threads + + for (int i=0; ithread[i], worker_thread, pool); + if (ret != 0) { + // cerr << "pthread_create() failed: " << ret << endl; + return DE265_ERROR_CANNOT_START_THREADPOOL; + } + + pool->num_threads++; + } + + return err; +} + + +void stop_thread_pool(thread_pool* pool) +{ + de265_mutex_lock(&pool->mutex); + pool->stopped = true; + de265_mutex_unlock(&pool->mutex); + + de265_cond_broadcast(&pool->cond_var, &pool->mutex); + + for (int i=0;inum_threads;i++) { + de265_thread_join(pool->thread[i]); + de265_thread_destroy(&pool->thread[i]); + } + + de265_mutex_destroy(&pool->mutex); + de265_cond_destroy(&pool->cond_var); +} + + +void add_task(thread_pool* pool, thread_task* task) +{ + de265_mutex_lock(&pool->mutex); + if (!pool->stopped) { + + pool->tasks.push_back(task); + + // wake up one thread + + de265_cond_signal(&pool->cond_var); + } + de265_mutex_unlock(&pool->mutex); +} diff --git a/transform.cc b/transform.cc new file mode 100644 index 0000000..ef404f8 --- /dev/null +++ b/transform.cc @@ -0,0 +1,739 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "transform.h" +#include "util.h" + +#include + + +const int tab8_22[] = { 29,30,31,32,33,33,34,34,35,35,36,36,37 /*,37*/ }; + + +// (8.6.1) +void decode_quantization_parameters(thread_context* tctx, int xC,int yC, + int xCUBase, int yCUBase) +{ + logtrace(LogTransform,">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> decode_quantization_parameters(int xC,int yC)=(%d,%d)\n", xC,yC); + + const pic_parameter_set& pps = tctx->img->get_pps(); + const seq_parameter_set& sps = tctx->img->get_sps(); + slice_segment_header* shdr = tctx->shdr; + + // top left pixel position of current quantization group + int xQG = xCUBase - (xCUBase & ((1<currentQG_x && + yQG == tctx->currentQG_y) + { + return; + } + */ + + // if first QG in CU, remember last QPY of last CU previous QG + + if (xQG != tctx->currentQG_x || + yQG != tctx->currentQG_y) + { + tctx->lastQPYinPreviousQG = tctx->currentQPY; + tctx->currentQG_x = xQG; + tctx->currentQG_y = yQG; + } + + int qPY_PRED; + + // first QG in CTB row ? + + int ctbLSBMask = ((1<shdr->SliceAddrRS; + + int SliceStartX = (first_ctb_in_slice_RS % sps.PicWidthInCtbsY) * sps.CtbSizeY; + int SliceStartY = (first_ctb_in_slice_RS / sps.PicWidthInCtbsY) * sps.CtbSizeY; + + bool firstQGInSlice = (SliceStartX == xQG && SliceStartY == yQG); + + // first QG in tile ? 
+ + bool firstQGInTile = false; + if (pps.tiles_enabled_flag) { + if ((xQG & ((1 << sps.Log2CtbSizeY)-1)) == 0 && + (yQG & ((1 << sps.Log2CtbSizeY)-1)) == 0) + { + int ctbX = xQG >> sps.Log2CtbSizeY; + int ctbY = yQG >> sps.Log2CtbSizeY; + + firstQGInTile = pps.is_tile_start_CTB(ctbX,ctbY); // TODO: this is slow + } + } + + + if (firstQGInSlice || firstQGInTile || + (firstInCTBRow && pps.entropy_coding_sync_enabled_flag)) { + qPY_PRED = tctx->shdr->SliceQPY; + } + else { + qPY_PRED = tctx->lastQPYinPreviousQG; + } + + + int qPYA,qPYB; + + if (tctx->img->available_zscan(xQG,yQG, xQG-1,yQG)) { + int xTmp = (xQG-1) >> sps.Log2MinTrafoSize; + int yTmp = (yQG ) >> sps.Log2MinTrafoSize; + int minTbAddrA = pps.MinTbAddrZS[xTmp + yTmp*sps.PicWidthInTbsY]; + int ctbAddrA = minTbAddrA >> (2 * (sps.Log2CtbSizeY-sps.Log2MinTrafoSize)); + if (ctbAddrA == tctx->CtbAddrInTS) { + qPYA = tctx->img->get_QPY(xQG-1,yQG); + } + else { + qPYA = qPY_PRED; + } + } + else { + qPYA = qPY_PRED; + } + + if (tctx->img->available_zscan(xQG,yQG, xQG,yQG-1)) { + int xTmp = (xQG ) >> sps.Log2MinTrafoSize; + int yTmp = (yQG-1) >> sps.Log2MinTrafoSize; + int minTbAddrB = pps.MinTbAddrZS[xTmp + yTmp*sps.PicWidthInTbsY]; + int ctbAddrB = minTbAddrB >> (2 * (sps.Log2CtbSizeY-sps.Log2MinTrafoSize)); + if (ctbAddrB == tctx->CtbAddrInTS) { + qPYB = tctx->img->get_QPY(xQG,yQG-1); + } + else { + qPYB = qPY_PRED; + } + } + else { + qPYB = qPY_PRED; + } + + qPY_PRED = (qPYA + qPYB + 1)>>1; + + logtrace(LogTransform,"qPY_PRED = %d (%d, %d)\n",qPY_PRED, qPYA, qPYB); + + int QPY = ((qPY_PRED + tctx->CuQpDelta + 52+2*sps.QpBdOffset_Y) % + (52 + sps.QpBdOffset_Y)) - sps.QpBdOffset_Y; + + tctx->qPYPrime = QPY + sps.QpBdOffset_Y; + if (tctx->qPYPrime<0) { + tctx->qPYPrime=0; + } + + int qPiCb = Clip3(-sps.QpBdOffset_C,57, QPY+pps.pic_cb_qp_offset + shdr->slice_cb_qp_offset + tctx->CuQpOffsetCb); + int qPiCr = Clip3(-sps.QpBdOffset_C,57, QPY+pps.pic_cr_qp_offset + shdr->slice_cr_qp_offset + tctx->CuQpOffsetCr); + + 
logtrace(LogTransform,"qPiCb:%d (%d %d), qPiCr:%d (%d %d)\n", + qPiCb, pps.pic_cb_qp_offset, shdr->slice_cb_qp_offset, + qPiCr, pps.pic_cr_qp_offset, shdr->slice_cr_qp_offset); + + int qPCb,qPCr; + + if (sps.ChromaArrayType == CHROMA_420) { + qPCb = table8_22(qPiCb); + qPCr = table8_22(qPiCr); + } + else { + qPCb = qPiCb; + qPCr = qPiCr; + } + + //printf("q: %d %d\n",qPiCb, qPCb); + + tctx->qPCbPrime = qPCb + sps.QpBdOffset_C; + if (tctx->qPCbPrime<0) { + tctx->qPCbPrime = 0; + } + + tctx->qPCrPrime = qPCr + sps.QpBdOffset_C; + if (tctx->qPCrPrime<0) { + tctx->qPCrPrime = 0; + } + + /* + printf("Q: %d (%d %d %d / %d %d) %d %d %d\n",QPY, + sps->QpBdOffset_Y, + pps->pic_cb_qp_offset + shdr->slice_cb_qp_offset, + pps->pic_cr_qp_offset + shdr->slice_cr_qp_offset, + sps->QpBdOffset_C, sps->QpBdOffset_C, + tctx->qPYPrime, tctx->qPCbPrime, tctx->qPCrPrime); + */ + + int log2CbSize = tctx->img->get_log2CbSize(xCUBase, yCUBase); + + // TODO: On broken input, log2CbSize may be zero (multithreaded only). Not sure yet why. + // Maybe another decoding thread is overwriting the value set in slice.cc:read_coding_unit. 
+ // id:000163,sig:06,src:002041,op:havoc,rep:16.bin + if (log2CbSize<3) { log2CbSize=3; } + + tctx->img->set_QPY(xCUBase, yCUBase, log2CbSize, QPY); + tctx->currentQPY = QPY; + + /* + printf("SET QPY POC=%d %d;%d-%d;%d = %d\n",ctx->img->PicOrderCntVal,xCUBase,yCUBase, + xCUBase+(1<qPYPrime); +} + + + +template +void transform_coefficients(acceleration_functions* acceleration, + int16_t* coeff, int coeffStride, int nT, int trType, + pixel_t* dst, int dstStride, int bit_depth) +{ + logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT); + + + if (trType==1) { + + acceleration->transform_4x4_dst_add(dst, coeff, dstStride, bit_depth); + + } else { + + /**/ if (nT==4) { acceleration->transform_add(0,dst,coeff,dstStride, bit_depth); } + else if (nT==8) { acceleration->transform_add(1,dst,coeff,dstStride, bit_depth); } + else if (nT==16) { acceleration->transform_add(2,dst,coeff,dstStride, bit_depth); } + else { acceleration->transform_add(3,dst,coeff,dstStride, bit_depth); } + } + +#if 0 + printf("decoded pixels:\n"); + for (int y=0;yimg->get_sps().BitDepth_C; + const int BitDepthY = tctx->img->get_sps().BitDepth_Y; + + for (int y=0;yBitDepthC, for which we could also eliminate one shift. The remaining + case is also one shift only. 
+ */ + + residual[y*nT+x] += (tctx->ResScaleVal * + ((tctx->residual_luma[y*nT+x] << BitDepthC ) >> BitDepthY ) ) >> 3; + } +} + + +template +void transform_coefficients_explicit(thread_context* tctx, + int16_t* coeff, int coeffStride, int nT, int trType, + pixel_t* dst, int dstStride, int bit_depth, int cIdx) +{ + logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT); + + const acceleration_functions* acceleration = &tctx->decctx->acceleration; + + int32_t residual_buffer[32*32]; + int32_t* residual; + if (cIdx==0) { + residual = tctx->residual_luma; + } + else { + residual = residual_buffer; + } + + + // TODO + int bdShift = 20 - bit_depth; + int max_coeff_bits = 15; + + if (trType==1) { + + acceleration->transform_idst_4x4(residual, coeff, bdShift, max_coeff_bits); + + } else { + + /**/ if (nT==4) { acceleration->transform_idct_4x4(residual,coeff,bdShift,max_coeff_bits); } + else if (nT==8) { acceleration->transform_idct_8x8(residual,coeff,bdShift,max_coeff_bits); } + else if (nT==16) { acceleration->transform_idct_16x16(residual,coeff,bdShift,max_coeff_bits); } + else { acceleration->transform_idct_32x32(residual,coeff,bdShift,max_coeff_bits); } + } + + + //printBlk("prediction",(uint8_t*)dst,nT,dstStride); + //printBlk("residual",residual,nT,nT); + + if (cIdx != 0) { + if (tctx->ResScaleVal != 0) { + cross_comp_pred(tctx, residual, nT); + } + + //printBlk("cross-comp-pred modified residual",residual,nT,nT); + } + + acceleration->add_residual(dst,dstStride, residual,nT, bit_depth); +} + + +void inv_transform(acceleration_functions* acceleration, + uint8_t* dst, int dstStride, int16_t* coeff, + int log2TbSize, int trType) +{ + if (trType==1) { + assert(log2TbSize==2); + + acceleration->transform_4x4_dst_add_8(dst, coeff, dstStride); + + } else { + acceleration->transform_add_8[log2TbSize-2](dst,coeff,dstStride); + } + + +#if 0 + int nT = 1<fwd_transform_4x4_dst_8(coeff, src, srcStride); + } else { + // DCT 4x4, 8x8, 16x16, 32x32 + + 
acceleration->fwd_transform_8[log2TbSize-2](coeff,src,srcStride); + } +} + + + +static const int levelScale[] = { 40,45,51,57,64,72 }; + +// (8.6.2) and (8.6.3) +template +void scale_coefficients_internal(thread_context* tctx, + int xT,int yT, // position of TU in frame (chroma adapted) + int x0,int y0, // position of CU in frame (chroma adapted) + int nT, int cIdx, + bool transform_skip_flag, bool intra, int rdpcmMode) +{ + const seq_parameter_set& sps = tctx->img->get_sps(); + const pic_parameter_set& pps = tctx->img->get_pps(); + + int qP; + switch (cIdx) { + case 0: qP = tctx->qPYPrime; break; + case 1: qP = tctx->qPCbPrime; break; + case 2: qP = tctx->qPCrPrime; break; + default: qP = 0; assert(0); break; // should never happen + } + + logtrace(LogTransform,"qP: %d\n",qP); + + + int16_t* coeff; + int coeffStride; + + coeff = tctx->coeffBuf; + coeffStride = nT; + + + + + + pixel_t* pred; + int stride; + pred = tctx->img->get_image_plane_at_pos_NEW(cIdx, xT,yT); + stride = tctx->img->get_image_stride(cIdx); + + // We explicitly include the case for sizeof(pixel_t)==1 so that the compiler + // can optimize away a lot of code for 8-bit pixels. + const int bit_depth = ((sizeof(pixel_t)==1) ? 
8 : sps.get_bit_depth(cIdx)); + + //assert(intra == (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA)); + int cuPredModeIntra = (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA); + + bool rotateCoeffs = (sps.range_extension.transform_skip_rotation_enabled_flag && + nT == 4 && + cuPredModeIntra); + + if (tctx->cu_transquant_bypass_flag) { + + int32_t residual_buffer[32*32]; + + int32_t* residual; + if (cIdx==0) residual = tctx->residual_luma; + else residual = residual_buffer; + + + // TODO: we could fold the coefficient rotation into the coefficient expansion here: + for (int i=0;inCoeff[cIdx];i++) { + int32_t currCoeff = tctx->coeffList[cIdx][i]; + tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff; + } + + if (rotateCoeffs) { + tctx->decctx->acceleration.rotate_coefficients(coeff, nT); + } + + if (rdpcmMode) { + if (rdpcmMode==2) + tctx->decctx->acceleration.transform_bypass_rdpcm_v(residual, coeff, nT); + else + tctx->decctx->acceleration.transform_bypass_rdpcm_h(residual, coeff, nT); + } + else { + tctx->decctx->acceleration.transform_bypass(residual, coeff, nT); + } + + if (cIdx != 0) { + if (tctx->ResScaleVal != 0) { + cross_comp_pred(tctx, residual, nT); + } + } + + tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth); + + if (rotateCoeffs) { + memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around + } + } + else { + // (8.6.3) + + int bdShift = (cIdx==0 ? 
sps.BitDepth_Y : sps.BitDepth_C) + Log2(nT) - 5; + + logtrace(LogTransform,"bdShift=%d\n",bdShift); + + logtrace(LogTransform,"dequant %d;%d cIdx=%d qp=%d\n",xT*(cIdx?2:1),yT*(cIdx?2:1),cIdx,qP); + + + // --- inverse quantization --- + + if (sps.scaling_list_enable_flag==0) { + + //const int m_x_y = 16; + const int m_x_y = 1; + bdShift -= 4; // this is equivalent to having a m_x_y of 16 and we can use 32bit integers + + const int offset = (1<<(bdShift-1)); + const int fact = m_x_y * levelScale[qP%6] << (qP/6); + + for (int i=0;inCoeff[cIdx];i++) { + + // usually, this needs to be 64bit, but because we modify the shift above, we can use 16 bit + int32_t currCoeff = tctx->coeffList[cIdx][i]; + + //logtrace(LogTransform,"coefficient[%d] = %d\n",tctx->coeffPos[cIdx][i], + //tctx->coeffList[cIdx][i]); + + currCoeff = Clip3(-32768,32767, + ( (currCoeff * fact + offset ) >> bdShift)); + + //logtrace(LogTransform," -> %d\n",currCoeff); + + tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff; + } + } + else { + const int offset = (1<<(bdShift-1)); + + const uint8_t* sclist; + int matrixID = cIdx; + if (!intra) { + if (nT<32) { matrixID += 3; } + else { matrixID++; } + } + + switch (nT) { + case 4: sclist = &pps.scaling_list.ScalingFactor_Size0[matrixID][0][0]; break; + case 8: sclist = &pps.scaling_list.ScalingFactor_Size1[matrixID][0][0]; break; + case 16: sclist = &pps.scaling_list.ScalingFactor_Size2[matrixID][0][0]; break; + case 32: sclist = &pps.scaling_list.ScalingFactor_Size3[matrixID][0][0]; break; + default: assert(0); + } + + for (int i=0;inCoeff[cIdx];i++) { + int pos = tctx->coeffPos[cIdx][i]; + int x = pos%nT; + int y = pos/nT; + + const int m_x_y = sclist[x+y*nT]; + const int fact = m_x_y * levelScale[qP%6] << (qP/6); + + int64_t currCoeff = tctx->coeffList[cIdx][i]; + + currCoeff = Clip3(-32768,32767, + ( (currCoeff * fact + offset ) >> bdShift)); + + tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff; + } + } + + + // --- do transform or skip --- + + 
logtrace(LogTransform,"coefficients OUT:\n"); + for (int y=0;ydecctx->acceleration.rotate_coefficients(coeff, nT); + } + + int32_t residual_buffer[32*32]; + + int32_t* residual; + if (cIdx==0) residual = tctx->residual_luma; + else residual = residual_buffer; + + if (rdpcmMode) { + /* + if (rdpcmMode==2) + tctx->decctx->acceleration.transform_skip_rdpcm_v(pred,coeff, Log2(nT), stride, bit_depth); + else + tctx->decctx->acceleration.transform_skip_rdpcm_h(pred,coeff, Log2(nT), stride, bit_depth); + */ + + if (rdpcmMode==2) + tctx->decctx->acceleration.rdpcm_v(residual, coeff,nT, tsShift,bdShift); + else + tctx->decctx->acceleration.rdpcm_h(residual, coeff,nT, tsShift,bdShift); + } + else { + //tctx->decctx->acceleration.transform_skip(pred, coeff, stride, bit_depth); + + tctx->decctx->acceleration.transform_skip_residual(residual, coeff, nT, tsShift, bdShift); + } + + if (cIdx != 0) { + if (tctx->ResScaleVal != 0) { + cross_comp_pred(tctx, residual, nT); + } + } + + tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth); + + if (rotateCoeffs) { + memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around + } + } + else { + int trType; + + //if (nT==4 && cIdx==0 && tctx->img->get_pred_mode(xT,yT)==MODE_INTRA) { + if (nT==4 && cIdx==0 && cuPredModeIntra) { + trType=1; + } + else { + trType=0; + } + + assert(rdpcmMode==0); + + + if (tctx->img->get_pps().range_extension.cross_component_prediction_enabled_flag) { + // cross-component-prediction: transform to residual buffer and add in a separate step + + transform_coefficients_explicit(tctx, coeff, coeffStride, nT, trType, + pred, stride, bit_depth, cIdx); + } + else { + transform_coefficients(&tctx->decctx->acceleration, coeff, coeffStride, nT, trType, + pred, stride, bit_depth); + } + } + } + + + logtrace(LogTransform,"pixels (cIdx:%d), position %d %d:\n",cIdx, xT,yT); + + for (int y=0;ynCoeff[cIdx];i++) { + tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = 0; + } +} + + 
+void scale_coefficients(thread_context* tctx, + int xT,int yT, // position of TU in frame (chroma adapted) + int x0,int y0, // position of CU in frame (chroma adapted) + int nT, int cIdx, + bool transform_skip_flag, bool intra, + int rdpcmMode // 0 - off, 1 - Horizontal, 2 - Vertical + ) +{ + if (tctx->img->high_bit_depth(cIdx)) { + scale_coefficients_internal(tctx, xT,yT, x0,y0, nT,cIdx, transform_skip_flag, intra, + rdpcmMode); + } else { + scale_coefficients_internal (tctx, xT,yT, x0,y0, nT,cIdx, transform_skip_flag, intra, + rdpcmMode); + } +} + + +//#define QUANT_IQUANT_SHIFT 20 // Q(QP%6) * IQ(QP%6) = 2^20 +#define QUANT_SHIFT 14 // Q(4) = 2^14 +//#define SCALE_BITS 15 // Inherited from TMuC, pressumably for fractional bit estimates in RDOQ +#define MAX_TR_DYNAMIC_RANGE 15 // Maximum transform dynamic range (excluding sign bit) + + +const static uint16_t g_quantScales[6] = { + 26214,23302,20560,18396,16384,14564 +}; + +void quant_coefficients(//encoder_context* ectx, + int16_t* out_coeff, + const int16_t* in_coeff, + int log2TrSize, int qp, + bool intra) +{ + const int qpDiv6 = qp / 6; + const int qpMod6 = qp % 6; + + //int uiLog2TrSize = xLog2( iWidth - 1); + + int uiQ = g_quantScales[qpMod6]; + int bitDepth = 8; + int transformShift = MAX_TR_DYNAMIC_RANGE - bitDepth - log2TrSize; // Represents scaling through forward transform + int qBits = QUANT_SHIFT + qpDiv6 + transformShift; + + /* TODO: originally, this was checking for intra slices, why not for intra mode ? + */ + int rnd = (intra ? 171 : 85) << (qBits-9); + + int x, y; + int uiAcSum = 0; + + int nStride = (1< ", x,y,level); + sign = (level < 0 ? 
-1: 1); + + level = (abs_value(level) * uiQ + rnd ) >> qBits; + uiAcSum += level; + level *= sign; + out_coeff[blockPos] = Clip3(-32768, 32767, level); + //logtrace(LogTransform,"%d\n", out_coeff[blockPos]); + } + } +} + + +void dequant_coefficients(int16_t* out_coeff, + const int16_t* in_coeff, + int log2TrSize, int qP) +{ + const int m_x_y = 1; + int bitDepth = 8; + int bdShift = bitDepth + log2TrSize - 5; + bdShift -= 4; // this is equivalent to having a m_x_y of 16 and we can use 32bit integers + + const int offset = (1<<(bdShift-1)); + const int fact = m_x_y * levelScale[qP%6] << (qP/6); + + int blkSize = (1<> bdShift)); + + //logtrace(LogTransform," -> %d\n",currCoeff); + + out_coeff[i] = currCoeff; + } +} diff --git a/util.cc b/util.cc new file mode 100644 index 0000000..61be238 --- /dev/null +++ b/util.cc @@ -0,0 +1,247 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "util.h" +#include "de265.h" + +#include +#include +#include + + +void copy_subimage(uint8_t* dst,int dststride, + const uint8_t* src,int srcstride, + int w, int h) +{ + for (int y=0;y=2; +} +#endif + +#ifdef DE265_LOG_TRACE +void logtrace(enum LogModule module, const char* string, ...) 
+{ + if (verbosity<3) return; + if (current_poc < log_poc_start) { return; } + if (disable_log[module]) return; + + //if (module != LogSymbols /*&& module != LogCABAC*/) { return; } + //if (logcnt<319500) return; + + //if (module != LogCABAC) return; + + va_list va; + + if (string[0]=='$') { + int id = string[1]-'0'; + logcnt[id]++; + fprintf(stdout, "[%ld] ",logcnt[id]); + + string += 3; + } + + int noPrefix = (string[0]=='*'); + if (!noPrefix) { } // fprintf(stdout, "ERR: "); + va_start(va, string); + vfprintf(stdout, string + (noPrefix ? 1 : 0), va); + va_end(va); + fflush(stdout); +} +#endif + +void log2fh(FILE* fh, const char* string, ...) +{ + va_list va; + + int noPrefix = (string[0]=='*'); + if (!noPrefix) fprintf(stdout, "INFO: "); + va_start(va, string); + vfprintf(fh, string + (noPrefix ? 1 : 0), va); + va_end(va); + fflush(stdout); +} + + + +void printBlk(const char* title, const int16_t* data, int blksize, int stride, + const std::string& prefix) +{ + if (title) printf("%s%s:\n",prefix.c_str(),title); + + for (int y=0;y + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "visualize.h" +#include "decctx.h" + +#include + +#if 0 +void writeFrame_Y(de265_image* img,const char* filename) +{ + int w = ctx->img->get_width(); + int h = ctx->img->get_height(); + //int c_idx=0; + int ctb_size = 64; // HACK + + int stride = ctx->img->get_luma_stride(); + + for (int ctbY=0;ctbYcurrent_sps->PicHeightInCtbsY;ctbY++) + for (int ctbX=0;ctbXcurrent_sps->PicWidthInCtbsY;ctbX++) + { + int x0 = ctbX*ctb_size; + int y0 = ctbY*ctb_size; + + + uint8_t *src = ctx->img->get_image_plane_at_pos(0,x0,y0); + + printf("%s %d %d\n",filename,x0,y0); + int dx,dy; + for (dy=0;dyget_image_plane_at_pos(c, 0,y), de265_get_image_width(img,c), 1, fh); + + fflush(fh); + fclose(fh); +} + + +void set_pixel(uint8_t* img, int x,int y, int stride, uint32_t color, int pixelSize) +{ + for (int i=0;i>(i*8)) & 0xFF; + img[y*stride + x*pixelSize + i] = col; + } +} + + +void draw_block_boundary(const de265_image* srcimg, + uint8_t* img,int stride, + int x,int y,int hBlkSize, int vBlkSize, uint32_t color, int pixelSize) +{ + for (int i=0;iget_sps().pic_height_in_luma_samples) { + set_pixel(img,x,yi,stride,color,pixelSize); + } + } + + for (int i=0;iget_sps().pic_width_in_luma_samples) { + set_pixel(img,xi,y,stride,color,pixelSize); + } + } +} + + +#include "intrapred.h" + +void draw_intra_pred_mode(const de265_image* srcimg, + uint8_t* img,int stride, + int x0,int y0,int log2BlkSize, + enum IntraPredMode mode, uint32_t color,int pixelSize) +{ + int w = 1< draw square + + for (int i=-w*1/4;i<=w*1/4;i++) + { + set_pixel(img, x0+w*1/4, y0+w/2+i,stride, color, pixelSize); + set_pixel(img, x0+w*3/4, y0+w/2+i,stride, color, pixelSize); + set_pixel(img, x0+w/2+i, y0+w*1/4,stride, color, pixelSize); + set_pixel(img, x0+w/2+i, y0+w*3/4,stride, color, pixelSize); + } + } + else if (mode==1) { + // DC -> draw circle + + for (int i=-w/4;i draw line in prediction direction + + int slope = intraPredAngle_table[mode]; + bool horiz = (mode<18); + + if (horiz) { + for (int 
i=-w/2;i=0 && yget_sps().pic_height_in_luma_samples) { + set_pixel(img, x0+i+w/2, y, stride, color, pixelSize); + } + } + } + else { + for (int i=-w/2;i=0 && xget_sps().pic_width_in_luma_samples) { + set_pixel(img, x, y0+i+w/2, stride, color, pixelSize); + } + } + } + } +} + + +void drawTBgrid(const de265_image* srcimg, uint8_t* img, int stride, + int x0,int y0, uint32_t color, int pixelSize, int log2CbSize, int trafoDepth) +{ + int split_transform_flag = srcimg->get_split_transform_flag(x0,y0,trafoDepth); + if (split_transform_flag) { + int x1 = x0 + ((1<<(log2CbSize-trafoDepth))>>1); + int y1 = y0 + ((1<<(log2CbSize-trafoDepth))>>1); + drawTBgrid(srcimg,img,stride,x0,y0,color,pixelSize,log2CbSize,trafoDepth+1); + drawTBgrid(srcimg,img,stride,x1,y0,color,pixelSize,log2CbSize,trafoDepth+1); + drawTBgrid(srcimg,img,stride,x0,y1,color,pixelSize,log2CbSize,trafoDepth+1); + drawTBgrid(srcimg,img,stride,x1,y1,color,pixelSize,log2CbSize,trafoDepth+1); + } + else { + draw_block_boundary(srcimg,img,stride,x0,y0,1<<(log2CbSize-trafoDepth),1<<(log2CbSize-trafoDepth), color, pixelSize); + } +} + + +enum DrawMode { + Partitioning_CB, + Partitioning_TB, + Partitioning_PB, + IntraPredMode, + PBPredMode, + PBMotionVectors, + QuantP_Y +}; + + +void tint_rect(uint8_t* img, int stride, int x0,int y0,int w,int h, uint32_t color, int pixelSize) +{ + for (int y=0;y>(i*8)) & 0xFF; + img[yp*stride+xp*pixelSize + i] = (img[yp*stride+xp*pixelSize + i] + col)/2; + } + } +} + +void fill_rect(uint8_t* img, int stride, int x0,int y0,int w,int h, uint32_t color, int pixelSize) +{ + for (int y=0;y>(i*8)) & 0xFF; + img[yp*stride+xp*pixelSize + i] = col; + } + } +} + + +void draw_QuantPY_block(const de265_image* srcimg,uint8_t* img,int stride, + int x0,int y0, int w,int h, int pixelSize) +{ + int q = srcimg->get_QPY(x0,y0); + + const int MIN_DRAW_Q = 20; + const int MAX_DRAW_Q = 40; + + if (qMAX_DRAW_Q) q=MAX_DRAW_Q; + + float f = ((float)q-MIN_DRAW_Q)/(MAX_DRAW_Q-MIN_DRAW_Q); + uint32_t col = 
0xFF * f; + col = col | (col<<8) | (col<<16); + + fill_rect(img,stride, x0,y0,w,h, col, pixelSize); +} + + +void draw_line(uint8_t* img,int stride,uint32_t color,int pixelSize, + int width,int height, + int x0,int y0,int x1,int y1) +{ + if (x1==x0 && y1==y0) { + set_pixel(img,x0,y0,stride,color,pixelSize); + } + else if (abs(x1-x0) < abs(y1-y0)) { + for (int y=y0;y<=y1;y += Sign(y1-y0)) + { + int x = (y-y0)*(x1-x0)/(y1-y0) + x0; + + if (x>=0 && x=0 && y=0 && x=0 && yget_pred_mode(x0,y0); + + uint32_t cols[3] = { 0xff0000, 0x0000ff, 0x00ff00 }; + + tint_rect(img,stride, x0,y0,w,h, cols[predMode], pixelSize); + } + else if (what == PBMotionVectors) { + const PBMotion& mvi = srcimg->get_mv_info(x0,y0); + int x = x0+w/2; + int y = y0+h/2; + if (mvi.predFlag[0]) { + draw_line(img,stride,0xFF0000,pixelSize, + srcimg->get_width(), + srcimg->get_height(), + x,y,x+mvi.mv[0].x,y+mvi.mv[0].y); + } + if (mvi.predFlag[1]) { + draw_line(img,stride,0x00FF00,pixelSize, + srcimg->get_width(), + srcimg->get_height(), + x,y,x+mvi.mv[1].x,y+mvi.mv[1].y); + } + } +} + + +void draw_tree_grid(const de265_image* srcimg, uint8_t* img, int stride, + uint32_t color, int pixelSize, enum DrawMode what) +{ + const seq_parameter_set& sps = srcimg->get_sps(); + int minCbSize = sps.MinCbSizeY; + + for (int y0=0;y0get_log2CbSize_cbUnits(x0,y0); + if (log2CbSize==0) { + continue; + } + + int xb = x0*minCbSize; + int yb = y0*minCbSize; + + int CbSize = 1<get_PartMode(xb,yb); + + int HalfCbSize = (1<<(log2CbSize-1)); + + switch (partMode) { + case PART_2Nx2N: + draw_PB_block(srcimg,img,stride,xb,yb,CbSize,CbSize, what,color,pixelSize); + break; + case PART_NxN: + draw_PB_block(srcimg,img,stride,xb, yb, CbSize/2,CbSize/2, what,color,pixelSize); + draw_PB_block(srcimg,img,stride,xb+HalfCbSize,yb, CbSize/2,CbSize/2, what,color,pixelSize); + draw_PB_block(srcimg,img,stride,xb ,yb+HalfCbSize,CbSize/2,CbSize/2, what,color,pixelSize); + 
draw_PB_block(srcimg,img,stride,xb+HalfCbSize,yb+HalfCbSize,CbSize/2,CbSize/2, what,color,pixelSize); + break; + case PART_2NxN: + draw_PB_block(srcimg,img,stride,xb, yb, CbSize ,CbSize/2, what,color,pixelSize); + draw_PB_block(srcimg,img,stride,xb, yb+HalfCbSize,CbSize ,CbSize/2, what,color,pixelSize); + break; + case PART_Nx2N: + draw_PB_block(srcimg,img,stride,xb, yb, CbSize/2,CbSize, what,color,pixelSize); + draw_PB_block(srcimg,img,stride,xb+HalfCbSize,yb, CbSize/2,CbSize, what,color,pixelSize); + break; + case PART_2NxnU: + draw_PB_block(srcimg,img,stride,xb, yb, CbSize ,CbSize/4, what,color,pixelSize); + draw_PB_block(srcimg,img,stride,xb, yb+CbSize/4 ,CbSize ,CbSize*3/4, what,color,pixelSize); + break; + case PART_2NxnD: + draw_PB_block(srcimg,img,stride,xb, yb, CbSize ,CbSize*3/4, what,color,pixelSize); + draw_PB_block(srcimg,img,stride,xb, yb+CbSize*3/4,CbSize ,CbSize/4, what,color,pixelSize); + break; + case PART_nLx2N: + draw_PB_block(srcimg,img,stride,xb, yb, CbSize/4 ,CbSize, what,color,pixelSize); + draw_PB_block(srcimg,img,stride,xb+CbSize/4 ,yb, CbSize*3/4,CbSize, what,color,pixelSize); + break; + case PART_nRx2N: + draw_PB_block(srcimg,img,stride,xb, yb, CbSize*3/4,CbSize, what,color,pixelSize); + draw_PB_block(srcimg,img,stride,xb+CbSize*3/4,yb, CbSize/4 ,CbSize, what,color,pixelSize); + break; + default: + assert(false); + break; + } + } + else if (what==IntraPredMode) { + enum PredMode predMode = srcimg->get_pred_mode(xb,yb); + if (predMode == MODE_INTRA) { + enum PartMode partMode = srcimg->get_PartMode(xb,yb); + + int HalfCbSize = (1<<(log2CbSize-1)); + + switch (partMode) { + case PART_2Nx2N: + draw_intra_pred_mode(srcimg,img,stride,xb,yb,log2CbSize, + srcimg->get_IntraPredMode(xb,yb), color,pixelSize); + break; + case PART_NxN: + draw_intra_pred_mode(srcimg,img,stride,xb, yb, log2CbSize-1, + srcimg->get_IntraPredMode(xb,yb), color,pixelSize); + draw_intra_pred_mode(srcimg,img,stride,xb+HalfCbSize,yb, log2CbSize-1, + 
srcimg->get_IntraPredMode(xb+HalfCbSize,yb), color,pixelSize); + draw_intra_pred_mode(srcimg,img,stride,xb ,yb+HalfCbSize,log2CbSize-1, + srcimg->get_IntraPredMode(xb,yb+HalfCbSize), color,pixelSize); + draw_intra_pred_mode(srcimg,img,stride,xb+HalfCbSize,yb+HalfCbSize,log2CbSize-1, + srcimg->get_IntraPredMode(xb+HalfCbSize,yb+HalfCbSize), color,pixelSize); + break; + default: + assert(false); + break; + } + } + } + } +} + + +LIBDE265_API void draw_CB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t color,int pixelSize) +{ + draw_tree_grid(img,dst,stride,color,pixelSize, Partitioning_CB); +} + +LIBDE265_API void draw_TB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t color,int pixelSize) +{ + draw_tree_grid(img,dst,stride,color,pixelSize, Partitioning_TB); +} + +LIBDE265_API void draw_PB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t color,int pixelSize) +{ + draw_tree_grid(img,dst,stride,color,pixelSize, Partitioning_PB); +} + +LIBDE265_API void draw_intra_pred_modes(const de265_image* img, uint8_t* dst, int stride, uint32_t color,int pixelSize) +{ + draw_tree_grid(img,dst,stride,color,pixelSize, IntraPredMode); +} + +LIBDE265_API void draw_PB_pred_modes(const de265_image* img, uint8_t* dst, int stride, int pixelSize) +{ + draw_tree_grid(img,dst,stride,0,pixelSize, PBPredMode); +} + +LIBDE265_API void draw_QuantPY(const de265_image* img, uint8_t* dst, int stride, int pixelSize) +{ + draw_tree_grid(img,dst,stride,0,pixelSize, QuantP_Y); +} + +LIBDE265_API void draw_Motion(const de265_image* img, uint8_t* dst, int stride, int pixelSize) +{ + draw_tree_grid(img,dst,stride,0,pixelSize, PBMotionVectors); +} + +LIBDE265_API void draw_Slices(const de265_image* img, uint8_t* dst, int stride, int pixelSize) +{ + const seq_parameter_set& sps = img->get_sps(); + + // --- mark first CTB in slice (red - independent / green - dependent) --- + + for (int ctby=0;ctby0 || ctby>0) { prevCtbRS = img->get_pps().CtbAddrTStoRS[ 
img->get_pps().CtbAddrRStoTS[ctbAddrRS] -1 ]; } + + if (prevCtbRS<0 || + img->get_SliceHeaderIndex_atIndex(ctbAddrRS) != + img->get_SliceHeaderIndex_atIndex(prevCtbRS)) { + int step=2; + int fillcolor = 0xFF0000; + + if (img->get_SliceHeaderCtb(ctbx,ctby)->dependent_slice_segment_flag) { + step=2; + fillcolor = 0x00FF00; + } + + for (int x=0;x<1<0 && (img->get_SliceHeaderIndexCtb(ctbx ,ctby) != + img->get_SliceHeaderIndexCtb(ctbx-1,ctby))) { + int x = ctbx << sps.Log2CtbSizeY; + int y0 = ctby << sps.Log2CtbSizeY; + + for (int y=y0; + (y0 && (img->get_SliceHeaderIndexCtb(ctbx,ctby ) != + img->get_SliceHeaderIndexCtb(ctbx,ctby-1))) { + int x0 = ctbx << sps.Log2CtbSizeY; + int y = ctby << sps.Log2CtbSizeY; + + for (int x=x0 ; + (xget_sps(); + const pic_parameter_set& pps = img->get_pps(); + + + for (int tx=1;tx + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "vps.h" +#include "util.h" +#include "decctx.h" + +#include + + +void profile_data::set_defaults(enum profile_idc profile, int level_major, int level_minor) +{ + profile_present_flag = 1; + + profile_space = 0; + tier_flag = 0; + profile_idc = profile; + + for (int i=0;i<32;i++) { + profile_compatibility_flag[i]=0; + } + + switch (profile) { + case Profile_Main: + profile_compatibility_flag[Profile_Main]=1; + profile_compatibility_flag[Profile_Main10]=1; + break; + case Profile_Main10: + profile_compatibility_flag[Profile_Main10]=1; + break; + default: + assert(0); + } + + progressive_source_flag = 0; + interlaced_source_flag = 0; + non_packed_constraint_flag = 0; + frame_only_constraint_flag = 0; + + + // --- level --- + + level_present_flag = 1; + level_idc = level_major*30 + level_minor*3; +} + + +void video_parameter_set::set_defaults(enum profile_idc profile, int level_major, int level_minor) +{ + video_parameter_set_id = 0; + vps_max_layers = 1; // always =1 in current version of standard + vps_max_sub_layers = 1; // temporal sub-layers + vps_temporal_id_nesting_flag = 1; + + profile_tier_level_.general.set_defaults(profile,level_major,level_minor); + + vps_sub_layer_ordering_info_present_flag = 0; + layer[0].vps_max_dec_pic_buffering = 1; + layer[0].vps_max_num_reorder_pics = 0; + layer[0].vps_max_latency_increase = 0; + + vps_max_layer_id = 0; + vps_num_layer_sets = 1; + + layer_id_included_flag.resize(vps_num_layer_sets); + + + // --- timing info --- + + vps_timing_info_present_flag = 0; + vps_num_units_in_tick = 0; + vps_time_scale = 0; + vps_poc_proportional_to_timing_flag = 0; + + vps_num_ticks_poc_diff_one = 0; + vps_num_hrd_parameters = 0; + + + // --- vps extension --- + + vps_extension_flag = 0; +} + + +de265_error video_parameter_set::read(error_queue* errqueue, bitreader* reader) +{ + int vlc; + + video_parameter_set_id = vlc = get_bits(reader, 4); + if (vlc >= DE265_MAX_VPS_SETS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + 
+ skip_bits(reader, 2); + vps_max_layers = vlc = get_bits(reader,6) +1; + if (vlc > 63) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; // vps_max_layers_minus1 (range 0...63) + + vps_max_sub_layers = vlc = get_bits(reader,3) +1; + if (vlc >= MAX_TEMPORAL_SUBLAYERS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + + vps_temporal_id_nesting_flag = get_bits(reader,1); + skip_bits(reader, 16); + + profile_tier_level_.read(reader, vps_max_sub_layers); + + /* + read_bit_rate_pic_rate_info(reader, &bit_rate_pic_rate_info, + 0, vps_max_sub_layers-1); + */ + + vps_sub_layer_ordering_info_present_flag = get_bits(reader,1); + //assert(vps_max_sub_layers-1 < MAX_TEMPORAL_SUBLAYERS); + + int firstLayerRead = vps_sub_layer_ordering_info_present_flag ? 0 : (vps_max_sub_layers-1); + + for (int i=firstLayerRead;i=1024 || + vps_num_layer_sets == UVLC_ERROR) { + errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + vps_num_layer_sets += 1; + + layer_id_included_flag.resize(vps_num_layer_sets); + + for (int i=1; i <= vps_num_layer_sets-1; i++) + { + layer_id_included_flag[i].resize(vps_max_layer_id+1); + + for (int j=0; j <= vps_max_layer_id; j++) + { + layer_id_included_flag[i][j] = get_bits(reader,1); + } + } + + vps_timing_info_present_flag = get_bits(reader,1); + + if (vps_timing_info_present_flag) { + vps_num_units_in_tick = get_bits(reader,32); + vps_time_scale = get_bits(reader,32); + vps_poc_proportional_to_timing_flag = get_bits(reader,1); + + if (vps_poc_proportional_to_timing_flag) { + vps_num_ticks_poc_diff_one = get_uvlc(reader)+1; + vps_num_hrd_parameters = get_uvlc(reader); + + if (vps_num_hrd_parameters >= 1024 || vps_num_hrd_parameters < 0) { + errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + hrd_layer_set_idx .resize(vps_num_hrd_parameters); + cprms_present_flag.resize(vps_num_hrd_parameters); + + for (int 
i=0; i 0) { + cprms_present_flag[i] = get_bits(reader,1); + } + + //hrd_parameters(cprms_present_flag[i], vps_max_sub_layers_minus1) + + return DE265_OK; // TODO: decode hrd_parameters() + } + } + } + + vps_extension_flag = get_bits(reader,1); + + if (vps_extension_flag) { + /* + while( more_rbsp_data() ) + vps_extension_data_flag u(1) + rbsp_trailing_bits() + */ + } + + return DE265_OK; +} + + +de265_error video_parameter_set::write(error_queue* errqueue, CABAC_encoder& out) const +{ + if (video_parameter_set_id >= DE265_MAX_VPS_SETS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + out.write_bits(video_parameter_set_id,4); + + out.write_bits(0x3,2); + out.write_bits(vps_max_layers-1,6); + + if (vps_max_sub_layers >= MAX_TEMPORAL_SUBLAYERS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + out.write_bits(vps_max_sub_layers-1,3); + + out.write_bit(vps_temporal_id_nesting_flag); + out.write_bits(0xFFFF, 16); + + profile_tier_level_.write(out, vps_max_sub_layers); + + /* + read_bit_rate_pic_rate_info(reader, &bit_rate_pic_rate_info, + 0, vps_max_sub_layers-1); + */ + + out.write_bit(vps_sub_layer_ordering_info_present_flag); + //assert(vps_max_sub_layers-1 < MAX_TEMPORAL_SUBLAYERS); + + int firstLayerRead = vps_sub_layer_ordering_info_present_flag ? 
0 : (vps_max_sub_layers-1); + + for (int i=firstLayerRead;i=1024) { + errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + out.write_bits(vps_max_layer_id,6); + out.write_uvlc(vps_num_layer_sets-1); + + for (int i=1; i <= vps_num_layer_sets-1; i++) + for (int j=0; j <= vps_max_layer_id; j++) + { + out.write_bit(layer_id_included_flag[i][j]); + } + + out.write_bit(vps_timing_info_present_flag); + + if (vps_timing_info_present_flag) { + out.write_bits(vps_num_units_in_tick,32); + out.write_bits(vps_time_scale ,32); + out.write_bit (vps_poc_proportional_to_timing_flag); + + if (vps_poc_proportional_to_timing_flag) { + out.write_uvlc(vps_num_ticks_poc_diff_one-1); + out.write_uvlc(vps_num_hrd_parameters); + + for (int i=0; i 0) { + out.write_bit(cprms_present_flag[i]); + } + + //hrd_parameters(cprms_present_flag[i], vps_max_sub_layers_minus1) + + return DE265_OK; // TODO: decode hrd_parameters() + } + } + } + + out.write_bit(vps_extension_flag); + + if (vps_extension_flag) { + /* + while( more_rbsp_data() ) + vps_extension_data_flag u(1) + rbsp_trailing_bits() + */ + } + + return DE265_OK; +} + + +void profile_data::read(bitreader* reader) +{ + if (profile_present_flag) { + profile_space = get_bits(reader,2); + tier_flag = get_bits(reader,1); + profile_idc = (enum profile_idc)get_bits(reader,5); + + for (int i=0; i<32; i++) { + profile_compatibility_flag[i] = get_bits(reader,1); + } + + progressive_source_flag = get_bits(reader,1); + interlaced_source_flag = get_bits(reader,1); + non_packed_constraint_flag = get_bits(reader,1); + frame_only_constraint_flag = get_bits(reader,1); + skip_bits(reader,44); + } + + if (level_present_flag) { + level_idc = get_bits(reader,8); + } +} + + +void profile_tier_level::read(bitreader* reader, + int max_sub_layers) +{ + // --- read the general profile --- + + general.profile_present_flag = true; + general.level_present_flag = true; + general.read(reader); + 
+ + // --- read the profile/levels of the sub-layers --- + + for (int i=0; i 1) + { + for (int i=max_sub_layers-1; i<8; i++) + { + skip_bits(reader,2); + } + } + + for (int i=0; i 1) + { + for (int i=max_sub_layers-1; i<8; i++) + { + out.skip_bits(2); + } + } + + for (int i=0; ibit_rate_info_present_flag[i] = get_bits(reader,1); + hdr->pic_rate_info_present_flag[i] = get_bits(reader,1); + + if (hdr->bit_rate_info_present_flag[i]) { + hdr->avg_bit_rate[i] = get_bits(reader,16); + hdr->max_bit_rate[i] = get_bits(reader,16); + } + + if (hdr->pic_rate_info_present_flag[i]) { + hdr->constant_pic_rate_idc[i] = get_bits(reader,2); + hdr->avg_pic_rate[i] = get_bits(reader,16); + } + } +} +*/ + + + +#define LOG0(t) log2fh(fh, t) +#define LOG1(t,d) log2fh(fh, t,d) +#define LOG2(t,d1,d2) log2fh(fh, t,d1,d2) +#define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3) + +void video_parameter_set::dump(int fd) const +{ + FILE* fh; + if (fd==1) fh=stdout; + else if (fd==2) fh=stderr; + else { return; } + + LOG0("----------------- VPS -----------------\n"); + LOG1("video_parameter_set_id : %d\n", video_parameter_set_id); + LOG1("vps_max_layers : %d\n", vps_max_layers); + LOG1("vps_max_sub_layers : %d\n", vps_max_sub_layers); + LOG1("vps_temporal_id_nesting_flag : %d\n", vps_temporal_id_nesting_flag); + + profile_tier_level_.dump(vps_max_sub_layers, fh); + //dump_bit_rate_pic_rate_info(&bit_rate_pic_rate_info, 0, vps_max_sub_layers-1); + + LOG1("vps_sub_layer_ordering_info_present_flag : %d\n", + vps_sub_layer_ordering_info_present_flag); + + if (vps_sub_layer_ordering_info_present_flag) { + for (int i=0;i 0) { + LOG2("cprms_present_flag[%d] = %d\n", i, cprms_present_flag[i]); + } + + //hrd_parameters(cprms_present_flag[i], vps_max_sub_layers_minus1) + + return; // TODO: decode hrd_parameters() + } + } + } + + LOG1("vps_extension_flag = %d\n", vps_extension_flag); +} + + +static const char* profile_name(profile_idc p) +{ + switch (p) { + case Profile_Main: return "Main"; + case 
Profile_Main10: return "Main10"; + case Profile_MainStillPicture: return "MainStillPicture"; + case Profile_FormatRangeExtensions: return "FormatRangeExtensions"; + default: + return "(unknown)"; + } +} + + +void profile_data::dump(bool general, FILE* fh) const +{ + const char* prefix = (general ? "general" : "sub_layer"); + + if (profile_present_flag) { + LOG2(" %s_profile_space : %d\n", prefix,profile_space); + LOG2(" %s_tier_flag : %d\n", prefix,tier_flag); + LOG2(" %s_profile_idc : %s\n", prefix, profile_name(profile_idc)); + + LOG1(" %s_profile_compatibility_flags: ", prefix); + for (int i=0; i<32; i++) { + if (i) LOG0("*,"); + LOG1("*%d",profile_compatibility_flag[i]); + } + LOG0("*\n"); + LOG2(" %s_progressive_source_flag : %d\n",prefix,progressive_source_flag); + LOG2(" %s_interlaced_source_flag : %d\n",prefix,interlaced_source_flag); + LOG2(" %s_non_packed_constraint_flag : %d\n",prefix,non_packed_constraint_flag); + LOG2(" %s_frame_only_constraint_flag : %d\n",prefix,frame_only_constraint_flag); + } + + if (level_present_flag) { + LOG3(" %s_level_idc : %d (%4.2f)\n", prefix,level_idc, level_idc/30.0f); + } +} + + +void profile_tier_level::dump(int max_sub_layers, FILE* fh) const +{ + general.dump(true, fh); + + for (int i=0; ibit_rate_info_present_flag[i]) { + LOG(" avg_bit_rate : %d\n", hdr->avg_bit_rate[i]); + LOG(" max_bit_rate : %d\n", hdr->max_bit_rate[i]); + } + + if (hdr->pic_rate_info_present_flag[i]) { + LOG(" constant_pic_rate_idc : %d\n", hdr->constant_pic_rate_idc[i]); + LOG(" avg_pic_rate[i] : %d\n", hdr->avg_pic_rate[i]); + } + } +} +*/ diff --git a/vui.cc b/vui.cc new file mode 100644 index 0000000..5524fa8 --- /dev/null +++ b/vui.cc @@ -0,0 +1,425 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. 
+ * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "vui.h" +#include "decctx.h" + +#include +#include +#include + +#define READ_VLC_OFFSET(variable, vlctype, offset) \ + if ((vlc = get_ ## vlctype(br)) == UVLC_ERROR) { \ + errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); \ + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; \ + } \ + variable = vlc + offset; + +#define READ_VLC(variable, vlctype) READ_VLC_OFFSET(variable,vlctype,0) + + +#define NUM_SAR_PRESETS 17 + +static uint16_t sar_presets[NUM_SAR_PRESETS+1][2] = { + { 0,0 }, + { 1,1 }, + { 12,11 }, + { 10,11 }, + { 16,11 }, + { 40,33 }, + { 24,11 }, + { 20,11 }, + { 32,11 }, + { 80,33 }, + { 18,11 }, + { 15,11 }, + { 64,33 }, + { 160,99 }, + { 4,3 }, + { 3,2 }, + { 2,1 } +}; + +#define EXTENDED_SAR 255 + + +const char* get_video_format_name(enum VideoFormat format) +{ + switch (format) { + case VideoFormat_Component: return "component"; + case VideoFormat_PAL: return "PAL"; + case VideoFormat_NTSC: return "NTSC"; + case VideoFormat_SECAM: return "SECAM"; + case VideoFormat_MAC: return "MAC"; + default: return "unspecified"; + } +} + + +video_usability_information::video_usability_information() +{ + aspect_ratio_info_present_flag = false; + sar_width = 0; + sar_height = 0; + + + // --- overscan --- + + overscan_info_present_flag = false; + overscan_appropriate_flag = false; + + + // --- video 
signal type --- + + video_signal_type_present_flag = false; + video_format = VideoFormat_Unspecified; + video_full_range_flag = false; + colour_description_present_flag = false; + colour_primaries = 2; + transfer_characteristics = 2; + matrix_coeffs = 2; + + // --- chroma / interlaced --- + + chroma_loc_info_present_flag = false; + chroma_sample_loc_type_top_field = 0; + chroma_sample_loc_type_bottom_field = 0; + + neutral_chroma_indication_flag = false; + field_seq_flag = false; + frame_field_info_present_flag = false; + + // --- default display window --- + + default_display_window_flag = false; + def_disp_win_left_offset = 0; + def_disp_win_right_offset = 0; + def_disp_win_top_offset = 0; + def_disp_win_bottom_offset = 0; + + + // --- timing --- + + vui_timing_info_present_flag = false; + vui_num_units_in_tick = 0; + vui_time_scale = 0; + + vui_poc_proportional_to_timing_flag = false; + vui_num_ticks_poc_diff_one = 1; + + + // --- hrd parameters --- + + vui_hrd_parameters_present_flag = false; + //hrd_parameters vui_hrd_parameters; + + + // --- bitstream restriction --- + + bitstream_restriction_flag = false; + tiles_fixed_structure_flag = false; + motion_vectors_over_pic_boundaries_flag = true; + restricted_ref_pic_lists_flag = false; + min_spatial_segmentation_idc = 0; + max_bytes_per_pic_denom = 2; + max_bits_per_min_cu_denom = 1; + log2_max_mv_length_horizontal = 15; + log2_max_mv_length_vertical = 15; +} + +
+// Parse the VUI parameters from `br` into this object.
+//
+//   errqueue - receives a warning for every coded value that is out of range
+//   br       - bit reader positioned at the first VUI bit
+//   sps      - not used by this function
+//
+// Returns DE265_OK on success, DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE if a
+// variable-length code cannot be read (see READ_VLC), and
+// DE265_ERROR_NOT_IMPLEMENTED_YET if the stream carries HRD parameters.
+// Out-of-range values that can be repaired are replaced by the same defaults
+// that are used when the corresponding syntax element is absent.
+de265_error video_usability_information::read(error_queue* errqueue, bitreader* br,
+                                              const seq_parameter_set* sps)
+{
+  int vlc;   // scratch variable required by the READ_VLC macros
+
+
+  // --- sample aspect ratio (SAR) ---
+
+  aspect_ratio_info_present_flag = get_bits(br,1);
+  if (aspect_ratio_info_present_flag) {
+    int aspect_ratio_idc = get_bits(br,8);
+    if (aspect_ratio_idc <= NUM_SAR_PRESETS) {
+      sar_width = sar_presets[aspect_ratio_idc][0];
+      sar_height = sar_presets[aspect_ratio_idc][1];
+    }
+    else if (aspect_ratio_idc == EXTENDED_SAR) {
+      sar_width = get_bits(br,16);
+      sar_height = get_bits(br,16);
+    }
+    else {
+      // neither a preset nor the extended form: report 0:0
+      sar_width = 0;
+      sar_height = 0;
+    }
+  }
+  else {
+    sar_width = 0;
+    sar_height = 0;
+  }
+
+
+  // --- overscan ---
+
+  overscan_info_present_flag = get_bits(br,1);
+  if (overscan_info_present_flag) {
+    overscan_appropriate_flag = get_bits(br,1);
+  }
+
+
+  // --- video signal type ---
+
+  { // defaults
+    video_format = VideoFormat_Unspecified;
+    video_full_range_flag = false;
+    colour_primaries = 2;
+    transfer_characteristics = 2;
+    matrix_coeffs = 2;
+  }
+
+  video_signal_type_present_flag = get_bits(br,1);
+  if (video_signal_type_present_flag) {
+    int video_format_idc = get_bits(br,3);
+    if (video_format_idc > 5) {
+      video_format_idc = VideoFormat_Unspecified;
+    }
+    video_format = (VideoFormat)video_format_idc;
+
+    video_full_range_flag = get_bits(br,1);
+
+    colour_description_present_flag = get_bits(br,1);
+    if (colour_description_present_flag) {
+      // values rejected by the checks below fall back to 2, the same value
+      // that is used when no colour description is coded
+      colour_primaries = get_bits(br,8);
+      if (colour_primaries == 0 ||
+          colour_primaries == 3 ||
+          colour_primaries >= 11) {
+        colour_primaries = 2;
+      }
+
+      transfer_characteristics = get_bits(br,8);
+      if (transfer_characteristics == 0 ||
+          transfer_characteristics == 3 ||
+          transfer_characteristics >= 18) {
+        transfer_characteristics = 2;
+      }
+
+      matrix_coeffs = get_bits(br,8);
+      if (matrix_coeffs == 0 ||
+          matrix_coeffs >= 11) {
+        matrix_coeffs = 2;
+      }
+    }
+  }
+
+
+  // --- chroma / interlaced ---
+
+  chroma_loc_info_present_flag = get_bits(br,1);
+  if (chroma_loc_info_present_flag) {
+    READ_VLC(chroma_sample_loc_type_top_field, uvlc);
+    READ_VLC(chroma_sample_loc_type_bottom_field, uvlc);
+  }
+  else {
+    chroma_sample_loc_type_top_field = 0;
+    chroma_sample_loc_type_bottom_field = 0;
+  }
+
+  neutral_chroma_indication_flag = get_bits(br,1);
+  field_seq_flag = get_bits(br,1);
+  frame_field_info_present_flag = get_bits(br,1);
+
+
+  // --- default display window ---
+
+  default_display_window_flag = get_bits(br,1);
+  if (default_display_window_flag) {
+    READ_VLC(def_disp_win_left_offset ,uvlc);
+    READ_VLC(def_disp_win_right_offset ,uvlc);
+    READ_VLC(def_disp_win_top_offset ,uvlc);
+    READ_VLC(def_disp_win_bottom_offset,uvlc);
+  }
+  else {
+    def_disp_win_left_offset =0;
+    def_disp_win_right_offset =0;
+    def_disp_win_top_offset =0;
+    def_disp_win_bottom_offset=0;
+  }
+
+
+  // --- timing ---
+
+  // In the VUI syntax, vui_poc_proportional_to_timing_flag and
+  // vui_hrd_parameters_present_flag are only coded when
+  // vui_timing_info_present_flag is set, and
+  // vui_num_ticks_poc_diff_one_minus1 only when the POC flag is set.
+  // Reading them unconditionally consumes bits that belong to
+  // bitstream_restriction_flag and everything after it.
+
+  vui_timing_info_present_flag = get_bits(br,1);
+  if (vui_timing_info_present_flag) {
+    vui_num_units_in_tick = get_bits(br,32);
+    vui_time_scale = get_bits(br,32);
+
+    vui_poc_proportional_to_timing_flag = get_bits(br,1);
+    if (vui_poc_proportional_to_timing_flag) {
+      READ_VLC_OFFSET(vui_num_ticks_poc_diff_one, uvlc, 1);
+    }
+    else {
+      vui_num_ticks_poc_diff_one = 1;
+    }
+
+
+    // --- hrd parameters ---
+
+    vui_hrd_parameters_present_flag = get_bits(br,1);
+    if (vui_hrd_parameters_present_flag) {
+      return DE265_ERROR_NOT_IMPLEMENTED_YET;
+      //hrd_parameters vui_hrd_parameters;
+    }
+  }
+  else {
+    // not coded: use the same values the constructor sets
+    vui_poc_proportional_to_timing_flag = false;
+    vui_num_ticks_poc_diff_one = 1;
+    vui_hrd_parameters_present_flag = false;
+  }
+
+
+  // --- bitstream restriction ---
+
+  bitstream_restriction_flag = get_bits(br,1);
+  if (bitstream_restriction_flag) {
+    tiles_fixed_structure_flag = get_bits(br,1);
+    motion_vectors_over_pic_boundaries_flag = get_bits(br,1);
+    restricted_ref_pic_lists_flag = get_bits(br,1);
+
+    // each out-of-range value below is reported and replaced by its default
+
+    READ_VLC(min_spatial_segmentation_idc, uvlc);
+    if (min_spatial_segmentation_idc > 4095) {
+      errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
+      min_spatial_segmentation_idc = 0;
+    }
+
+    READ_VLC(max_bytes_per_pic_denom, uvlc);
+    if (max_bytes_per_pic_denom > 16) {
+      errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
+      max_bytes_per_pic_denom = 2;
+    }
+
+    READ_VLC(max_bits_per_min_cu_denom, uvlc);
+    if (max_bits_per_min_cu_denom > 16) {
+      errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
+      max_bits_per_min_cu_denom = 1;
+    }
+
+    READ_VLC(log2_max_mv_length_horizontal, uvlc);
+    if (log2_max_mv_length_horizontal > 15) {
+      errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
+      log2_max_mv_length_horizontal = 15;
+    }
+
+    READ_VLC(log2_max_mv_length_vertical, uvlc);
+    if (log2_max_mv_length_vertical > 15) {
+      errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
+      log2_max_mv_length_vertical = 15;
+    }
+  }
+  else {
+    tiles_fixed_structure_flag = false;
+    motion_vectors_over_pic_boundaries_flag = true;
+    restricted_ref_pic_lists_flag = false; // NOTE: default not specified in standard 2014/10
+
+    min_spatial_segmentation_idc = 0;
+    max_bytes_per_pic_denom = 2;
+    max_bits_per_min_cu_denom = 1;
+    log2_max_mv_length_horizontal = 15;
+    log2_max_mv_length_vertical = 15;
+  }
+
+  //vui_read = true;
+
+  return DE265_OK;
+}
+ + +void video_usability_information::dump(int fd) const +{ + //#if (_MSC_VER >= 1500) + //#define LOG0(t) loginfo(LogHeaders, t) + //#define LOG1(t,d) loginfo(LogHeaders, t,d) + //#define LOG2(t,d1,d2) loginfo(LogHeaders, t,d1,d2) + //#define LOG3(t,d1,d2,d3) loginfo(LogHeaders, t,d1,d2,d3) + + FILE* fh; + if (fd==1) fh=stdout; + else if (fd==2) fh=stderr; + else { return; } + +#define LOG0(t) log2fh(fh, t) +#define LOG1(t,d) log2fh(fh, t,d) +#define LOG2(t,d1,d2) log2fh(fh, t,d1,d2) +#define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3) + + LOG0("----------------- VUI -----------------\n"); + LOG2("sample aspect ratio : %d:%d\n", sar_width,sar_height); + LOG1("overscan_info_present_flag : %d\n", overscan_info_present_flag); + LOG1("overscan_appropriate_flag : %d\n", overscan_appropriate_flag); + + LOG1("video_signal_type_present_flag: %d\n", video_signal_type_present_flag); + if (video_signal_type_present_flag) { + LOG1(" video_format : %s\n", get_video_format_name(video_format)); + LOG1(" video_full_range_flag : %d\n", video_full_range_flag); + LOG1(" colour_description_present_flag : %d\n", colour_description_present_flag); + LOG1(" colour_primaries : %d\n", colour_primaries); + LOG1(" transfer_characteristics : %d\n", transfer_characteristics); + LOG1(" matrix_coeffs : %d\n", matrix_coeffs); + } + + LOG1("chroma_loc_info_present_flag: %d\n", chroma_loc_info_present_flag); + if (chroma_loc_info_present_flag) { + LOG1(" 
chroma_sample_loc_type_top_field : %d\n", chroma_sample_loc_type_top_field); + LOG1(" chroma_sample_loc_type_bottom_field: %d\n", chroma_sample_loc_type_bottom_field); + } + + LOG1("neutral_chroma_indication_flag: %d\n", neutral_chroma_indication_flag); + LOG1("field_seq_flag : %d\n", field_seq_flag); + LOG1("frame_field_info_present_flag : %d\n", frame_field_info_present_flag); + + LOG1("default_display_window_flag : %d\n", default_display_window_flag); + LOG1(" def_disp_win_left_offset : %d\n", def_disp_win_left_offset); + LOG1(" def_disp_win_right_offset : %d\n", def_disp_win_right_offset); + LOG1(" def_disp_win_top_offset : %d\n", def_disp_win_top_offset); + LOG1(" def_disp_win_bottom_offset : %d\n", def_disp_win_bottom_offset); + + LOG1("vui_timing_info_present_flag : %d\n", vui_timing_info_present_flag); + if (vui_timing_info_present_flag) { + LOG1(" vui_num_units_in_tick : %d\n", vui_num_units_in_tick); + LOG1(" vui_time_scale : %d\n", vui_time_scale); + } + + LOG1("vui_poc_proportional_to_timing_flag : %d\n", vui_poc_proportional_to_timing_flag); + LOG1("vui_num_ticks_poc_diff_one : %d\n", vui_num_ticks_poc_diff_one); + + LOG1("vui_hrd_parameters_present_flag : %d\n", vui_hrd_parameters_present_flag); + if (vui_hrd_parameters_present_flag) { + //hrd_parameters vui_hrd_parameters; + } + + + // --- bitstream restriction --- + + LOG1("bitstream_restriction_flag : %d\n", bitstream_restriction_flag); + if (bitstream_restriction_flag) { + LOG1(" tiles_fixed_structure_flag : %d\n", tiles_fixed_structure_flag); + LOG1(" motion_vectors_over_pic_boundaries_flag : %d\n", motion_vectors_over_pic_boundaries_flag); + LOG1(" restricted_ref_pic_lists_flag : %d\n", restricted_ref_pic_lists_flag); + LOG1(" min_spatial_segmentation_idc : %d\n", min_spatial_segmentation_idc); + LOG1(" max_bytes_per_pic_denom : %d\n", max_bytes_per_pic_denom); + LOG1(" max_bits_per_min_cu_denom : %d\n", max_bits_per_min_cu_denom); + LOG1(" log2_max_mv_length_horizontal : %d\n", 
log2_max_mv_length_horizontal); + LOG1(" log2_max_mv_length_vertical : %d\n", log2_max_mv_length_vertical); + } + +#undef LOG0 +#undef LOG1 +#undef LOG2 +#undef LOG3 + //#endif +} diff --git a/x86/.deps/libde265_x86_la-sse.Plo b/x86/.deps/libde265_x86_la-sse.Plo new file mode 100644 index 0000000..d9c963b --- /dev/null +++ b/x86/.deps/libde265_x86_la-sse.Plo @@ -0,0 +1,64 @@ +libde265_x86_la-sse.lo: sse.cc /usr/include/stdc-predef.h \ + ../../libde265/x86/sse.h ../../libde265/acceleration.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stddef.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdint.h \ + /usr/include/stdint.h /usr/include/bits/libc-header-start.h \ + /usr/include/features.h /usr/include/sys/cdefs.h \ + /usr/include/bits/wordsize.h /usr/include/bits/long-double.h \ + /usr/include/gnu/stubs.h /usr/include/gnu/stubs-64.h \ + /usr/include/bits/types.h /usr/include/bits/timesize.h \ + /usr/include/bits/typesizes.h /usr/include/bits/time64.h \ + /usr/include/bits/wchar.h /usr/include/bits/stdint-intn.h \ + /usr/include/bits/stdint-uintn.h /usr/include/assert.h \ + ../../libde265/x86/sse-motion.h ../../libde265/x86/sse-dct.h \ + ../../config.h /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/cpuid.h + +/usr/include/stdc-predef.h: + +../../libde265/x86/sse.h: + +../../libde265/acceleration.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stddef.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdint.h: + +/usr/include/stdint.h: + +/usr/include/bits/libc-header-start.h: + +/usr/include/features.h: + +/usr/include/sys/cdefs.h: + +/usr/include/bits/wordsize.h: + +/usr/include/bits/long-double.h: + +/usr/include/gnu/stubs.h: + +/usr/include/gnu/stubs-64.h: + +/usr/include/bits/types.h: + +/usr/include/bits/timesize.h: + +/usr/include/bits/typesizes.h: + +/usr/include/bits/time64.h: + +/usr/include/bits/wchar.h: + +/usr/include/bits/stdint-intn.h: + +/usr/include/bits/stdint-uintn.h: + +/usr/include/assert.h: + +../../libde265/x86/sse-motion.h: + 
+../../libde265/x86/sse-dct.h: + +../../config.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/cpuid.h: diff --git a/x86/.deps/libde265_x86_sse_la-sse-dct.Plo b/x86/.deps/libde265_x86_sse_la-sse-dct.Plo new file mode 100644 index 0000000..418d8f2 --- /dev/null +++ b/x86/.deps/libde265_x86_sse_la-sse-dct.Plo @@ -0,0 +1,431 @@ +libde265_x86_sse_la-sse-dct.lo: sse-dct.cc /usr/include/stdc-predef.h \ + ../../libde265/x86/sse-dct.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stddef.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdint.h \ + /usr/include/stdint.h /usr/include/bits/libc-header-start.h \ + /usr/include/features.h /usr/include/sys/cdefs.h \ + /usr/include/bits/wordsize.h /usr/include/bits/long-double.h \ + /usr/include/gnu/stubs.h /usr/include/gnu/stubs-64.h \ + /usr/include/bits/types.h /usr/include/bits/timesize.h \ + /usr/include/bits/typesizes.h /usr/include/bits/time64.h \ + /usr/include/bits/wchar.h /usr/include/bits/stdint-intn.h \ + /usr/include/bits/stdint-uintn.h ../../libde265/util.h ../../config.h \ + /usr/include/inttypes.h /usr/include/stdio.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdarg.h \ + /usr/include/bits/types/__fpos_t.h /usr/include/bits/types/__mbstate_t.h \ + /usr/include/bits/types/__fpos64_t.h /usr/include/bits/types/__FILE.h \ + /usr/include/bits/types/FILE.h /usr/include/bits/types/struct_FILE.h \ + /usr/include/bits/types/cookie_io_functions_t.h \ + /usr/include/bits/stdio_lim.h /usr/include/bits/sys_errlist.h \ + /usr/include/bits/stdio.h /usr/include/c++/9.2.0/string \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++config.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/os_defines.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/cpu_defines.h \ + /usr/include/c++/9.2.0/bits/stringfwd.h \ + /usr/include/c++/9.2.0/bits/memoryfwd.h \ + /usr/include/c++/9.2.0/bits/char_traits.h \ + /usr/include/c++/9.2.0/bits/stl_algobase.h \ + /usr/include/c++/9.2.0/bits/functexcept.h \ + 
/usr/include/c++/9.2.0/bits/exception_defines.h \ + /usr/include/c++/9.2.0/bits/cpp_type_traits.h \ + /usr/include/c++/9.2.0/ext/type_traits.h \ + /usr/include/c++/9.2.0/ext/numeric_traits.h \ + /usr/include/c++/9.2.0/bits/stl_pair.h \ + /usr/include/c++/9.2.0/bits/move.h \ + /usr/include/c++/9.2.0/bits/concept_check.h \ + /usr/include/c++/9.2.0/type_traits \ + /usr/include/c++/9.2.0/bits/stl_iterator_base_types.h \ + /usr/include/c++/9.2.0/bits/stl_iterator_base_funcs.h \ + /usr/include/c++/9.2.0/debug/assertions.h \ + /usr/include/c++/9.2.0/bits/stl_iterator.h \ + /usr/include/c++/9.2.0/bits/ptr_traits.h \ + /usr/include/c++/9.2.0/debug/debug.h \ + /usr/include/c++/9.2.0/bits/predefined_ops.h \ + /usr/include/c++/9.2.0/bits/postypes.h /usr/include/c++/9.2.0/cwchar \ + /usr/include/wchar.h /usr/include/bits/floatn.h \ + /usr/include/bits/floatn-common.h /usr/include/bits/types/wint_t.h \ + /usr/include/bits/types/mbstate_t.h /usr/include/bits/types/locale_t.h \ + /usr/include/bits/types/__locale_t.h /usr/include/c++/9.2.0/cstdint \ + /usr/include/c++/9.2.0/bits/allocator.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++allocator.h \ + /usr/include/c++/9.2.0/ext/new_allocator.h /usr/include/c++/9.2.0/new \ + /usr/include/c++/9.2.0/exception /usr/include/c++/9.2.0/bits/exception.h \ + /usr/include/c++/9.2.0/bits/exception_ptr.h \ + /usr/include/c++/9.2.0/bits/cxxabi_init_exception.h \ + /usr/include/c++/9.2.0/typeinfo /usr/include/c++/9.2.0/bits/hash_bytes.h \ + /usr/include/c++/9.2.0/bits/nested_exception.h \ + /usr/include/c++/9.2.0/bits/localefwd.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++locale.h \ + /usr/include/c++/9.2.0/clocale /usr/include/locale.h \ + /usr/include/bits/locale.h /usr/include/c++/9.2.0/iosfwd \ + /usr/include/c++/9.2.0/cctype /usr/include/ctype.h /usr/include/endian.h \ + /usr/include/bits/endian.h /usr/include/bits/byteswap.h \ + /usr/include/bits/uintn-identity.h \ + /usr/include/c++/9.2.0/bits/ostream_insert.h \ + 
/usr/include/c++/9.2.0/bits/cxxabi_forced.h \ + /usr/include/c++/9.2.0/bits/stl_function.h \ + /usr/include/c++/9.2.0/backward/binders.h \ + /usr/include/c++/9.2.0/bits/range_access.h \ + /usr/include/c++/9.2.0/initializer_list \ + /usr/include/c++/9.2.0/bits/basic_string.h \ + /usr/include/c++/9.2.0/ext/atomicity.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/gthr.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/gthr-default.h \ + /usr/include/pthread.h /usr/include/sched.h \ + /usr/include/bits/types/time_t.h \ + /usr/include/bits/types/struct_timespec.h /usr/include/bits/sched.h \ + /usr/include/bits/types/struct_sched_param.h /usr/include/bits/cpu-set.h \ + /usr/include/time.h /usr/include/bits/time.h /usr/include/bits/timex.h \ + /usr/include/bits/types/struct_timeval.h \ + /usr/include/bits/types/clock_t.h /usr/include/bits/types/struct_tm.h \ + /usr/include/bits/types/clockid_t.h /usr/include/bits/types/timer_t.h \ + /usr/include/bits/types/struct_itimerspec.h \ + /usr/include/bits/pthreadtypes.h /usr/include/bits/thread-shared-types.h \ + /usr/include/bits/pthreadtypes-arch.h /usr/include/bits/setjmp.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/atomic_word.h \ + /usr/include/c++/9.2.0/ext/alloc_traits.h \ + /usr/include/c++/9.2.0/bits/alloc_traits.h \ + /usr/include/c++/9.2.0/ext/string_conversions.h \ + /usr/include/c++/9.2.0/cstdlib /usr/include/stdlib.h \ + /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \ + /usr/include/sys/types.h /usr/include/sys/select.h \ + /usr/include/bits/select.h /usr/include/bits/types/sigset_t.h \ + /usr/include/bits/types/__sigset_t.h /usr/include/alloca.h \ + /usr/include/bits/stdlib-bsearch.h /usr/include/bits/stdlib-float.h \ + /usr/include/c++/9.2.0/bits/std_abs.h /usr/include/c++/9.2.0/cstdio \ + /usr/include/c++/9.2.0/cerrno /usr/include/errno.h \ + /usr/include/bits/errno.h /usr/include/linux/errno.h \ + /usr/include/asm/errno.h /usr/include/asm-generic/errno.h \ + 
/usr/include/asm-generic/errno-base.h /usr/include/bits/types/error_t.h \ + /usr/include/c++/9.2.0/bits/functional_hash.h \ + /usr/include/c++/9.2.0/bits/basic_string.tcc ../../libde265/de265.h \ + ../../libde265/de265-version.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/emmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/xmmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/mmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/mm_malloc.h \ + /usr/include/c++/9.2.0/stdlib.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/tmmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/pmmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/smmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/popcntintrin.h + +/usr/include/stdc-predef.h: + +../../libde265/x86/sse-dct.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stddef.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdint.h: + +/usr/include/stdint.h: + +/usr/include/bits/libc-header-start.h: + +/usr/include/features.h: + +/usr/include/sys/cdefs.h: + +/usr/include/bits/wordsize.h: + +/usr/include/bits/long-double.h: + +/usr/include/gnu/stubs.h: + +/usr/include/gnu/stubs-64.h: + +/usr/include/bits/types.h: + +/usr/include/bits/timesize.h: + +/usr/include/bits/typesizes.h: + +/usr/include/bits/time64.h: + +/usr/include/bits/wchar.h: + +/usr/include/bits/stdint-intn.h: + +/usr/include/bits/stdint-uintn.h: + +../../libde265/util.h: + +../../config.h: + +/usr/include/inttypes.h: + +/usr/include/stdio.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdarg.h: + +/usr/include/bits/types/__fpos_t.h: + +/usr/include/bits/types/__mbstate_t.h: + +/usr/include/bits/types/__fpos64_t.h: + +/usr/include/bits/types/__FILE.h: + +/usr/include/bits/types/FILE.h: + +/usr/include/bits/types/struct_FILE.h: + +/usr/include/bits/types/cookie_io_functions_t.h: + +/usr/include/bits/stdio_lim.h: + +/usr/include/bits/sys_errlist.h: + +/usr/include/bits/stdio.h: + 
+/usr/include/c++/9.2.0/string: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++config.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/os_defines.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/cpu_defines.h: + +/usr/include/c++/9.2.0/bits/stringfwd.h: + +/usr/include/c++/9.2.0/bits/memoryfwd.h: + +/usr/include/c++/9.2.0/bits/char_traits.h: + +/usr/include/c++/9.2.0/bits/stl_algobase.h: + +/usr/include/c++/9.2.0/bits/functexcept.h: + +/usr/include/c++/9.2.0/bits/exception_defines.h: + +/usr/include/c++/9.2.0/bits/cpp_type_traits.h: + +/usr/include/c++/9.2.0/ext/type_traits.h: + +/usr/include/c++/9.2.0/ext/numeric_traits.h: + +/usr/include/c++/9.2.0/bits/stl_pair.h: + +/usr/include/c++/9.2.0/bits/move.h: + +/usr/include/c++/9.2.0/bits/concept_check.h: + +/usr/include/c++/9.2.0/type_traits: + +/usr/include/c++/9.2.0/bits/stl_iterator_base_types.h: + +/usr/include/c++/9.2.0/bits/stl_iterator_base_funcs.h: + +/usr/include/c++/9.2.0/debug/assertions.h: + +/usr/include/c++/9.2.0/bits/stl_iterator.h: + +/usr/include/c++/9.2.0/bits/ptr_traits.h: + +/usr/include/c++/9.2.0/debug/debug.h: + +/usr/include/c++/9.2.0/bits/predefined_ops.h: + +/usr/include/c++/9.2.0/bits/postypes.h: + +/usr/include/c++/9.2.0/cwchar: + +/usr/include/wchar.h: + +/usr/include/bits/floatn.h: + +/usr/include/bits/floatn-common.h: + +/usr/include/bits/types/wint_t.h: + +/usr/include/bits/types/mbstate_t.h: + +/usr/include/bits/types/locale_t.h: + +/usr/include/bits/types/__locale_t.h: + +/usr/include/c++/9.2.0/cstdint: + +/usr/include/c++/9.2.0/bits/allocator.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++allocator.h: + +/usr/include/c++/9.2.0/ext/new_allocator.h: + +/usr/include/c++/9.2.0/new: + +/usr/include/c++/9.2.0/exception: + +/usr/include/c++/9.2.0/bits/exception.h: + +/usr/include/c++/9.2.0/bits/exception_ptr.h: + +/usr/include/c++/9.2.0/bits/cxxabi_init_exception.h: + +/usr/include/c++/9.2.0/typeinfo: + +/usr/include/c++/9.2.0/bits/hash_bytes.h: + 
+/usr/include/c++/9.2.0/bits/nested_exception.h: + +/usr/include/c++/9.2.0/bits/localefwd.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++locale.h: + +/usr/include/c++/9.2.0/clocale: + +/usr/include/locale.h: + +/usr/include/bits/locale.h: + +/usr/include/c++/9.2.0/iosfwd: + +/usr/include/c++/9.2.0/cctype: + +/usr/include/ctype.h: + +/usr/include/endian.h: + +/usr/include/bits/endian.h: + +/usr/include/bits/byteswap.h: + +/usr/include/bits/uintn-identity.h: + +/usr/include/c++/9.2.0/bits/ostream_insert.h: + +/usr/include/c++/9.2.0/bits/cxxabi_forced.h: + +/usr/include/c++/9.2.0/bits/stl_function.h: + +/usr/include/c++/9.2.0/backward/binders.h: + +/usr/include/c++/9.2.0/bits/range_access.h: + +/usr/include/c++/9.2.0/initializer_list: + +/usr/include/c++/9.2.0/bits/basic_string.h: + +/usr/include/c++/9.2.0/ext/atomicity.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/gthr.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/gthr-default.h: + +/usr/include/pthread.h: + +/usr/include/sched.h: + +/usr/include/bits/types/time_t.h: + +/usr/include/bits/types/struct_timespec.h: + +/usr/include/bits/sched.h: + +/usr/include/bits/types/struct_sched_param.h: + +/usr/include/bits/cpu-set.h: + +/usr/include/time.h: + +/usr/include/bits/time.h: + +/usr/include/bits/timex.h: + +/usr/include/bits/types/struct_timeval.h: + +/usr/include/bits/types/clock_t.h: + +/usr/include/bits/types/struct_tm.h: + +/usr/include/bits/types/clockid_t.h: + +/usr/include/bits/types/timer_t.h: + +/usr/include/bits/types/struct_itimerspec.h: + +/usr/include/bits/pthreadtypes.h: + +/usr/include/bits/thread-shared-types.h: + +/usr/include/bits/pthreadtypes-arch.h: + +/usr/include/bits/setjmp.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/atomic_word.h: + +/usr/include/c++/9.2.0/ext/alloc_traits.h: + +/usr/include/c++/9.2.0/bits/alloc_traits.h: + +/usr/include/c++/9.2.0/ext/string_conversions.h: + +/usr/include/c++/9.2.0/cstdlib: + +/usr/include/stdlib.h: + 
+/usr/include/bits/waitflags.h: + +/usr/include/bits/waitstatus.h: + +/usr/include/sys/types.h: + +/usr/include/sys/select.h: + +/usr/include/bits/select.h: + +/usr/include/bits/types/sigset_t.h: + +/usr/include/bits/types/__sigset_t.h: + +/usr/include/alloca.h: + +/usr/include/bits/stdlib-bsearch.h: + +/usr/include/bits/stdlib-float.h: + +/usr/include/c++/9.2.0/bits/std_abs.h: + +/usr/include/c++/9.2.0/cstdio: + +/usr/include/c++/9.2.0/cerrno: + +/usr/include/errno.h: + +/usr/include/bits/errno.h: + +/usr/include/linux/errno.h: + +/usr/include/asm/errno.h: + +/usr/include/asm-generic/errno.h: + +/usr/include/asm-generic/errno-base.h: + +/usr/include/bits/types/error_t.h: + +/usr/include/c++/9.2.0/bits/functional_hash.h: + +/usr/include/c++/9.2.0/bits/basic_string.tcc: + +../../libde265/de265.h: + +../../libde265/de265-version.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/emmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/xmmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/mmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/mm_malloc.h: + +/usr/include/c++/9.2.0/stdlib.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/tmmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/pmmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/smmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/popcntintrin.h: diff --git a/x86/.deps/libde265_x86_sse_la-sse-motion.Plo b/x86/.deps/libde265_x86_sse_la-sse-motion.Plo new file mode 100644 index 0000000..492d1fd --- /dev/null +++ b/x86/.deps/libde265_x86_sse_la-sse-motion.Plo @@ -0,0 +1,432 @@ +libde265_x86_sse_la-sse-motion.lo: sse-motion.cc \ + /usr/include/stdc-predef.h ../../config.h /usr/include/stdio.h \ + /usr/include/bits/libc-header-start.h /usr/include/features.h \ + /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \ + /usr/include/bits/long-double.h /usr/include/gnu/stubs.h \ + /usr/include/gnu/stubs-64.h \ + 
/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stddef.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdarg.h \ + /usr/include/bits/types.h /usr/include/bits/timesize.h \ + /usr/include/bits/typesizes.h /usr/include/bits/time64.h \ + /usr/include/bits/types/__fpos_t.h /usr/include/bits/types/__mbstate_t.h \ + /usr/include/bits/types/__fpos64_t.h /usr/include/bits/types/__FILE.h \ + /usr/include/bits/types/FILE.h /usr/include/bits/types/struct_FILE.h \ + /usr/include/bits/types/cookie_io_functions_t.h \ + /usr/include/bits/stdio_lim.h /usr/include/bits/sys_errlist.h \ + /usr/include/bits/stdio.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/emmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/xmmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/mmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/mm_malloc.h \ + /usr/include/c++/9.2.0/stdlib.h /usr/include/c++/9.2.0/cstdlib \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++config.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/os_defines.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/cpu_defines.h \ + /usr/include/stdlib.h /usr/include/bits/waitflags.h \ + /usr/include/bits/waitstatus.h /usr/include/bits/floatn.h \ + /usr/include/bits/floatn-common.h /usr/include/bits/types/locale_t.h \ + /usr/include/bits/types/__locale_t.h /usr/include/sys/types.h \ + /usr/include/bits/types/clock_t.h /usr/include/bits/types/clockid_t.h \ + /usr/include/bits/types/time_t.h /usr/include/bits/types/timer_t.h \ + /usr/include/bits/stdint-intn.h /usr/include/endian.h \ + /usr/include/bits/endian.h /usr/include/bits/byteswap.h \ + /usr/include/bits/uintn-identity.h /usr/include/sys/select.h \ + /usr/include/bits/select.h /usr/include/bits/types/sigset_t.h \ + /usr/include/bits/types/__sigset_t.h \ + /usr/include/bits/types/struct_timeval.h \ + /usr/include/bits/types/struct_timespec.h \ + /usr/include/bits/pthreadtypes.h /usr/include/bits/thread-shared-types.h \ + 
/usr/include/bits/pthreadtypes-arch.h /usr/include/alloca.h \ + /usr/include/bits/stdlib-bsearch.h /usr/include/bits/stdlib-float.h \ + /usr/include/c++/9.2.0/bits/std_abs.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/tmmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/pmmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/smmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/popcntintrin.h \ + sse-motion.h /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdint.h \ + /usr/include/stdint.h /usr/include/bits/wchar.h \ + /usr/include/bits/stdint-uintn.h ../../libde265/util.h \ + /usr/include/inttypes.h /usr/include/c++/9.2.0/string \ + /usr/include/c++/9.2.0/bits/stringfwd.h \ + /usr/include/c++/9.2.0/bits/memoryfwd.h \ + /usr/include/c++/9.2.0/bits/char_traits.h \ + /usr/include/c++/9.2.0/bits/stl_algobase.h \ + /usr/include/c++/9.2.0/bits/functexcept.h \ + /usr/include/c++/9.2.0/bits/exception_defines.h \ + /usr/include/c++/9.2.0/bits/cpp_type_traits.h \ + /usr/include/c++/9.2.0/ext/type_traits.h \ + /usr/include/c++/9.2.0/ext/numeric_traits.h \ + /usr/include/c++/9.2.0/bits/stl_pair.h \ + /usr/include/c++/9.2.0/bits/move.h \ + /usr/include/c++/9.2.0/bits/concept_check.h \ + /usr/include/c++/9.2.0/type_traits \ + /usr/include/c++/9.2.0/bits/stl_iterator_base_types.h \ + /usr/include/c++/9.2.0/bits/stl_iterator_base_funcs.h \ + /usr/include/c++/9.2.0/debug/assertions.h \ + /usr/include/c++/9.2.0/bits/stl_iterator.h \ + /usr/include/c++/9.2.0/bits/ptr_traits.h \ + /usr/include/c++/9.2.0/debug/debug.h \ + /usr/include/c++/9.2.0/bits/predefined_ops.h \ + /usr/include/c++/9.2.0/bits/postypes.h /usr/include/c++/9.2.0/cwchar \ + /usr/include/wchar.h /usr/include/bits/types/wint_t.h \ + /usr/include/bits/types/mbstate_t.h /usr/include/c++/9.2.0/cstdint \ + /usr/include/c++/9.2.0/bits/allocator.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++allocator.h \ + /usr/include/c++/9.2.0/ext/new_allocator.h /usr/include/c++/9.2.0/new \ + 
/usr/include/c++/9.2.0/exception /usr/include/c++/9.2.0/bits/exception.h \ + /usr/include/c++/9.2.0/bits/exception_ptr.h \ + /usr/include/c++/9.2.0/bits/cxxabi_init_exception.h \ + /usr/include/c++/9.2.0/typeinfo /usr/include/c++/9.2.0/bits/hash_bytes.h \ + /usr/include/c++/9.2.0/bits/nested_exception.h \ + /usr/include/c++/9.2.0/bits/localefwd.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++locale.h \ + /usr/include/c++/9.2.0/clocale /usr/include/locale.h \ + /usr/include/bits/locale.h /usr/include/c++/9.2.0/iosfwd \ + /usr/include/c++/9.2.0/cctype /usr/include/ctype.h \ + /usr/include/c++/9.2.0/bits/ostream_insert.h \ + /usr/include/c++/9.2.0/bits/cxxabi_forced.h \ + /usr/include/c++/9.2.0/bits/stl_function.h \ + /usr/include/c++/9.2.0/backward/binders.h \ + /usr/include/c++/9.2.0/bits/range_access.h \ + /usr/include/c++/9.2.0/initializer_list \ + /usr/include/c++/9.2.0/bits/basic_string.h \ + /usr/include/c++/9.2.0/ext/atomicity.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/gthr.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/gthr-default.h \ + /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \ + /usr/include/bits/types/struct_sched_param.h /usr/include/bits/cpu-set.h \ + /usr/include/time.h /usr/include/bits/time.h /usr/include/bits/timex.h \ + /usr/include/bits/types/struct_tm.h \ + /usr/include/bits/types/struct_itimerspec.h /usr/include/bits/setjmp.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/atomic_word.h \ + /usr/include/c++/9.2.0/ext/alloc_traits.h \ + /usr/include/c++/9.2.0/bits/alloc_traits.h \ + /usr/include/c++/9.2.0/ext/string_conversions.h \ + /usr/include/c++/9.2.0/cstdio /usr/include/c++/9.2.0/cerrno \ + /usr/include/errno.h /usr/include/bits/errno.h \ + /usr/include/linux/errno.h /usr/include/asm/errno.h \ + /usr/include/asm-generic/errno.h /usr/include/asm-generic/errno-base.h \ + /usr/include/bits/types/error_t.h \ + /usr/include/c++/9.2.0/bits/functional_hash.h \ + 
/usr/include/c++/9.2.0/bits/basic_string.tcc ../../libde265/de265.h \ + ../../libde265/de265-version.h + +/usr/include/stdc-predef.h: + +../../config.h: + +/usr/include/stdio.h: + +/usr/include/bits/libc-header-start.h: + +/usr/include/features.h: + +/usr/include/sys/cdefs.h: + +/usr/include/bits/wordsize.h: + +/usr/include/bits/long-double.h: + +/usr/include/gnu/stubs.h: + +/usr/include/gnu/stubs-64.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stddef.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdarg.h: + +/usr/include/bits/types.h: + +/usr/include/bits/timesize.h: + +/usr/include/bits/typesizes.h: + +/usr/include/bits/time64.h: + +/usr/include/bits/types/__fpos_t.h: + +/usr/include/bits/types/__mbstate_t.h: + +/usr/include/bits/types/__fpos64_t.h: + +/usr/include/bits/types/__FILE.h: + +/usr/include/bits/types/FILE.h: + +/usr/include/bits/types/struct_FILE.h: + +/usr/include/bits/types/cookie_io_functions_t.h: + +/usr/include/bits/stdio_lim.h: + +/usr/include/bits/sys_errlist.h: + +/usr/include/bits/stdio.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/emmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/xmmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/mmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/mm_malloc.h: + +/usr/include/c++/9.2.0/stdlib.h: + +/usr/include/c++/9.2.0/cstdlib: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++config.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/os_defines.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/cpu_defines.h: + +/usr/include/stdlib.h: + +/usr/include/bits/waitflags.h: + +/usr/include/bits/waitstatus.h: + +/usr/include/bits/floatn.h: + +/usr/include/bits/floatn-common.h: + +/usr/include/bits/types/locale_t.h: + +/usr/include/bits/types/__locale_t.h: + +/usr/include/sys/types.h: + +/usr/include/bits/types/clock_t.h: + +/usr/include/bits/types/clockid_t.h: + +/usr/include/bits/types/time_t.h: + +/usr/include/bits/types/timer_t.h: + 
+/usr/include/bits/stdint-intn.h: + +/usr/include/endian.h: + +/usr/include/bits/endian.h: + +/usr/include/bits/byteswap.h: + +/usr/include/bits/uintn-identity.h: + +/usr/include/sys/select.h: + +/usr/include/bits/select.h: + +/usr/include/bits/types/sigset_t.h: + +/usr/include/bits/types/__sigset_t.h: + +/usr/include/bits/types/struct_timeval.h: + +/usr/include/bits/types/struct_timespec.h: + +/usr/include/bits/pthreadtypes.h: + +/usr/include/bits/thread-shared-types.h: + +/usr/include/bits/pthreadtypes-arch.h: + +/usr/include/alloca.h: + +/usr/include/bits/stdlib-bsearch.h: + +/usr/include/bits/stdlib-float.h: + +/usr/include/c++/9.2.0/bits/std_abs.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/tmmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/pmmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/smmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/popcntintrin.h: + +sse-motion.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdint.h: + +/usr/include/stdint.h: + +/usr/include/bits/wchar.h: + +/usr/include/bits/stdint-uintn.h: + +../../libde265/util.h: + +/usr/include/inttypes.h: + +/usr/include/c++/9.2.0/string: + +/usr/include/c++/9.2.0/bits/stringfwd.h: + +/usr/include/c++/9.2.0/bits/memoryfwd.h: + +/usr/include/c++/9.2.0/bits/char_traits.h: + +/usr/include/c++/9.2.0/bits/stl_algobase.h: + +/usr/include/c++/9.2.0/bits/functexcept.h: + +/usr/include/c++/9.2.0/bits/exception_defines.h: + +/usr/include/c++/9.2.0/bits/cpp_type_traits.h: + +/usr/include/c++/9.2.0/ext/type_traits.h: + +/usr/include/c++/9.2.0/ext/numeric_traits.h: + +/usr/include/c++/9.2.0/bits/stl_pair.h: + +/usr/include/c++/9.2.0/bits/move.h: + +/usr/include/c++/9.2.0/bits/concept_check.h: + +/usr/include/c++/9.2.0/type_traits: + +/usr/include/c++/9.2.0/bits/stl_iterator_base_types.h: + +/usr/include/c++/9.2.0/bits/stl_iterator_base_funcs.h: + +/usr/include/c++/9.2.0/debug/assertions.h: + +/usr/include/c++/9.2.0/bits/stl_iterator.h: + 
+/usr/include/c++/9.2.0/bits/ptr_traits.h: + +/usr/include/c++/9.2.0/debug/debug.h: + +/usr/include/c++/9.2.0/bits/predefined_ops.h: + +/usr/include/c++/9.2.0/bits/postypes.h: + +/usr/include/c++/9.2.0/cwchar: + +/usr/include/wchar.h: + +/usr/include/bits/types/wint_t.h: + +/usr/include/bits/types/mbstate_t.h: + +/usr/include/c++/9.2.0/cstdint: + +/usr/include/c++/9.2.0/bits/allocator.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++allocator.h: + +/usr/include/c++/9.2.0/ext/new_allocator.h: + +/usr/include/c++/9.2.0/new: + +/usr/include/c++/9.2.0/exception: + +/usr/include/c++/9.2.0/bits/exception.h: + +/usr/include/c++/9.2.0/bits/exception_ptr.h: + +/usr/include/c++/9.2.0/bits/cxxabi_init_exception.h: + +/usr/include/c++/9.2.0/typeinfo: + +/usr/include/c++/9.2.0/bits/hash_bytes.h: + +/usr/include/c++/9.2.0/bits/nested_exception.h: + +/usr/include/c++/9.2.0/bits/localefwd.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++locale.h: + +/usr/include/c++/9.2.0/clocale: + +/usr/include/locale.h: + +/usr/include/bits/locale.h: + +/usr/include/c++/9.2.0/iosfwd: + +/usr/include/c++/9.2.0/cctype: + +/usr/include/ctype.h: + +/usr/include/c++/9.2.0/bits/ostream_insert.h: + +/usr/include/c++/9.2.0/bits/cxxabi_forced.h: + +/usr/include/c++/9.2.0/bits/stl_function.h: + +/usr/include/c++/9.2.0/backward/binders.h: + +/usr/include/c++/9.2.0/bits/range_access.h: + +/usr/include/c++/9.2.0/initializer_list: + +/usr/include/c++/9.2.0/bits/basic_string.h: + +/usr/include/c++/9.2.0/ext/atomicity.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/gthr.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/gthr-default.h: + +/usr/include/pthread.h: + +/usr/include/sched.h: + +/usr/include/bits/sched.h: + +/usr/include/bits/types/struct_sched_param.h: + +/usr/include/bits/cpu-set.h: + +/usr/include/time.h: + +/usr/include/bits/time.h: + +/usr/include/bits/timex.h: + +/usr/include/bits/types/struct_tm.h: + +/usr/include/bits/types/struct_itimerspec.h: + 
+/usr/include/bits/setjmp.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/atomic_word.h: + +/usr/include/c++/9.2.0/ext/alloc_traits.h: + +/usr/include/c++/9.2.0/bits/alloc_traits.h: + +/usr/include/c++/9.2.0/ext/string_conversions.h: + +/usr/include/c++/9.2.0/cstdio: + +/usr/include/c++/9.2.0/cerrno: + +/usr/include/errno.h: + +/usr/include/bits/errno.h: + +/usr/include/linux/errno.h: + +/usr/include/asm/errno.h: + +/usr/include/asm-generic/errno.h: + +/usr/include/asm-generic/errno-base.h: + +/usr/include/bits/types/error_t.h: + +/usr/include/c++/9.2.0/bits/functional_hash.h: + +/usr/include/c++/9.2.0/bits/basic_string.tcc: + +../../libde265/de265.h: + +../../libde265/de265-version.h: diff --git a/x86/CMakeLists.txt b/x86/CMakeLists.txt new file mode 100644 index 0000000..0fd6fcf --- /dev/null +++ b/x86/CMakeLists.txt @@ -0,0 +1,23 @@ +set (x86_sources + sse.cc sse.h +) + +set (x86_sse_sources + sse-motion.cc sse-motion.h sse-dct.h sse-dct.cc +) + +add_library(x86 OBJECT ${x86_sources}) + +add_library(x86_sse OBJECT ${x86_sse_sources}) + +set(sse_flags "") + +if(NOT MSVC) + set(sse_flags "${sse_flags} -msse4.1") +endif() + +set(X86_OBJECTS $<TARGET_OBJECTS:x86> $<TARGET_OBJECTS:x86_sse> PARENT_SCOPE) + +if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + SET_TARGET_PROPERTIES(x86_sse PROPERTIES COMPILE_FLAGS "${sse_flags}") +endif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") diff --git a/x86/Makefile b/x86/Makefile new file mode 100644 index 0000000..003f863 --- /dev/null +++ b/x86/Makefile @@ -0,0 +1,703 @@ +# Makefile.in generated by automake 1.16.1 from Makefile.am. +# libde265/x86/Makefile. Generated from Makefile.in by configure. + +# Copyright (C) 1994-2018 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + + + +am__is_gnu_make = { \ + if test -z '$(MAKELEVEL)'; then \ + false; \ + elif test -n '$(MAKE_HOST)'; then \ + true; \ + elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ + true; \ + else \ + false; \ + fi; \ +} +am__make_running_with_option = \ + case $${target_option-} in \ + ?) ;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/libde265 +pkgincludedir = $(includedir)/libde265 +pkglibdir = $(libdir)/libde265 +pkglibexecdir = $(libexecdir)/libde265 +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 
+install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = x86_64-pc-linux-gnu +host_triplet = x86_64-pc-linux-gnu +target_triplet = x86_64-pc-linux-gnu +#am__append_1 = -DHAVE_VISIBILITY +#am__append_2 = -DHAVE_VISIBILITY +subdir = libde265/x86 +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/ax_compare_version.m4 \ + $(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \ + $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ + $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ + $(top_srcdir)/m4/lt~obsolete.m4 \ + $(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) +libde265_x86_la_DEPENDENCIES = libde265_x86_sse.la +am_libde265_x86_la_OBJECTS = libde265_x86_la-sse.lo +libde265_x86_la_OBJECTS = $(am_libde265_x86_la_OBJECTS) +AM_V_lt = $(am__v_lt_$(V)) +am__v_lt_ = $(am__v_lt_$(AM_DEFAULT_VERBOSITY)) +am__v_lt_0 = --silent +am__v_lt_1 = +libde265_x86_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ + $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +libde265_x86_sse_la_LIBADD = +am_libde265_x86_sse_la_OBJECTS = libde265_x86_sse_la-sse-motion.lo \ + libde265_x86_sse_la-sse-dct.lo +libde265_x86_sse_la_OBJECTS = $(am_libde265_x86_sse_la_OBJECTS) +libde265_x86_sse_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ + $(libde265_x86_sse_la_CXXFLAGS) 
$(CXXFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +AM_V_P = $(am__v_P_$(V)) +am__v_P_ = $(am__v_P_$(AM_DEFAULT_VERBOSITY)) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_$(V)) +am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_$(V)) +am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY)) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I. -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__maybe_remake_depfiles = depfiles +am__depfiles_remade = ./$(DEPDIR)/libde265_x86_la-sse.Plo \ + ./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo \ + ./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo +am__mv = mv -f +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CXXFLAGS) $(CXXFLAGS) +AM_V_CXX = $(am__v_CXX_$(V)) +am__v_CXX_ = $(am__v_CXX_$(AM_DEFAULT_VERBOSITY)) +am__v_CXX_0 = @echo " CXX " $@; +am__v_CXX_1 = +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ + $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CXXLD = $(am__v_CXXLD_$(V)) +am__v_CXXLD_ = $(am__v_CXXLD_$(AM_DEFAULT_VERBOSITY)) +am__v_CXXLD_0 = @echo " CXXLD " $@; +am__v_CXXLD_1 = +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_$(V)) +am__v_CC_ = $(am__v_CC_$(AM_DEFAULT_VERBOSITY)) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) 
--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_$(V)) +am__v_CCLD_ = $(am__v_CCLD_$(AM_DEFAULT_VERBOSITY)) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +SOURCES = $(libde265_x86_la_SOURCES) $(libde265_x86_sse_la_SOURCES) +DIST_SOURCES = $(libde265_x86_la_SOURCES) \ + $(libde265_x86_sse_la_SOURCES) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. 
+am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = ${SHELL} /home/dima/git/libde265/missing aclocal-1.16 +ALLOCA = +AMTAR = $${TAR-tar} +AM_DEFAULT_VERBOSITY = 1 +AR = ar +AUTOCONF = ${SHELL} /home/dima/git/libde265/missing autoconf +AUTOHEADER = ${SHELL} /home/dima/git/libde265/missing autoheader +AUTOMAKE = ${SHELL} /home/dima/git/libde265/missing automake-1.16 +AWK = gawk +CC = gcc +CCAS = gcc +CCASDEPMODE = depmode=gcc3 +CCASFLAGS = -g -O2 +CCDEPMODE = depmode=gcc3 +CFLAGS = -g -O2 -std=c99 -Wall +CPP = gcc -E +CPPFLAGS = +CXX = g++ +CXXCPP = g++ -E +CXXDEPMODE = depmode=gcc3 +CXXFLAGS = -g -O2 -Werror=return-type -Werror=unused-result -Werror=reorder -DDE265_LOG_ERROR +CYGPATH_W = echo +DEFS = -DHAVE_CONFIG_H +DEPDIR = .deps +DLLTOOL = false +DSYMUTIL = +DUMPBIN = +ECHO_C = +ECHO_N = -n +ECHO_T = +EGREP = /usr/bin/grep -E +EXEEXT = +FGREP = /usr/bin/grep -F +GREP = /usr/bin/grep +HAVE_CXX11 = +INSTALL = /usr/bin/install -c +INSTALL_DATA = ${INSTALL} -m 644 +INSTALL_PROGRAM = ${INSTALL} +INSTALL_SCRIPT = ${INSTALL} +INSTALL_STRIP_PROGRAM = $(install_sh) -c -s +LD = /usr/bin/ld -m elf_x86_64 +LDFLAGS = +LIBDE265_AGE = 0 +LIBDE265_CURRENT = 0 +LIBDE265_REVISION = 12 +LIBOBJS = +LIBS = -lpthread -lm +LIBTOOL = $(SHELL) $(top_builddir)/libtool +LIPO = +LN_S = ln -s +LTLIBOBJS = +LT_SYS_LIBRARY_PATH = +MAKEINFO = ${SHELL} /home/dima/git/libde265/missing makeinfo +MANIFEST_TOOL = : +MKDIR_P = /usr/bin/mkdir -p +NM = /usr/bin/nm -B +NMEDIT = +NUMERIC_VERSION = 0x01000500 +OBJDUMP = objdump +OBJEXT = o +OTOOL = +OTOOL64 = +PACKAGE = libde265 +PACKAGE_BUGREPORT = farin@struktur.de +PACKAGE_NAME = libde265 +PACKAGE_STRING = libde265 1.0.5 +PACKAGE_TARNAME = 
libde265 +PACKAGE_URL = +PACKAGE_VERSION = 1.0.5 +PATH_SEPARATOR = : +PKG_CONFIG = /usr/bin/pkg-config +PKG_CONFIG_LIBDIR = +PKG_CONFIG_PATH = +QTCHOOSER = +QTMOC = /usr/bin/moc-qt5 +QT_CFLAGS = -I/usr/include/qt/QtCore -I/usr/include/qt -I/usr/include/qt/QtGui -DQT_WIDGETS_LIB -I/usr/include/qt/QtWidgets -DQT_GUI_LIB -DQT_CORE_LIB +QT_LIBS = -lQt5Widgets -lQt5Gui -lQt5Core +RANLIB = ranlib +SDL_CFLAGS = -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT +SDL_LIBS = -lSDL -lpthread +SED = /usr/bin/sed +SET_MAKE = +SHELL = /bin/sh +STRIP = strip +SWSCALE_CFLAGS = +SWSCALE_LIBS = -lswscale +VERSION = 1.0.5 +VIDEOGFX_CFLAGS = +VIDEOGFX_LIBS = +abs_builddir = /home/dima/git/libde265/libde265/x86 +abs_srcdir = /home/dima/git/libde265/libde265/x86 +abs_top_builddir = /home/dima/git/libde265 +abs_top_srcdir = /home/dima/git/libde265 +ac_ct_AR = ar +ac_ct_CC = gcc +ac_ct_CXX = g++ +ac_ct_DUMPBIN = +am__include = include +am__leading_dot = . +am__quote = +am__tar = $${TAR-tar} chof - "$$tardir" +am__untar = $${TAR-tar} xf - +bindir = ${exec_prefix}/bin +build = x86_64-pc-linux-gnu +build_alias = +build_cpu = x86_64 +build_os = linux-gnu +build_vendor = pc +builddir = . +datadir = ${datarootdir} +datarootdir = ${prefix}/share +docdir = ${datarootdir}/doc/${PACKAGE_TARNAME} +dvidir = ${docdir} +exec_prefix = ${prefix} +host = x86_64-pc-linux-gnu +host_alias = +host_cpu = x86_64 +host_os = linux-gnu +host_vendor = pc +htmldir = ${docdir} +includedir = ${prefix}/include +infodir = ${datarootdir}/info +install_sh = ${SHELL} /home/dima/git/libde265/install-sh +libdir = ${exec_prefix}/lib +libexecdir = ${exec_prefix}/libexec +localedir = ${datarootdir}/locale +localstatedir = ${prefix}/var +mandir = ${datarootdir}/man +mkdir_p = $(MKDIR_P) +oldincludedir = /usr/include +pdfdir = ${docdir} +prefix = /usr/local +program_transform_name = s,x,x, +psdir = ${docdir} +sbindir = ${exec_prefix}/sbin +sharedstatedir = ${prefix}/com +srcdir = . 
+sysconfdir = ${prefix}/etc +target = x86_64-pc-linux-gnu +target_alias = +target_cpu = x86_64 +target_os = linux-gnu +target_vendor = pc +top_build_prefix = ../../ +top_builddir = ../.. +top_srcdir = ../.. +noinst_LTLIBRARIES = libde265_x86.la libde265_x86_sse.la +libde265_x86_la_CXXFLAGS = -I$(top_srcdir)/libde265 \ + $(CFLAG_VISIBILITY) $(am__append_1) +libde265_x86_la_SOURCES = sse.cc sse.h +libde265_x86_la_LIBADD = libde265_x86_sse.la + +# SSE4 specific functions +libde265_x86_sse_la_CXXFLAGS = -msse4.1 -I$(top_srcdir) \ + -I$(top_srcdir)/libde265 $(CFLAG_VISIBILITY) $(am__append_2) +libde265_x86_sse_la_SOURCES = sse-motion.cc sse-motion.h sse-dct.h sse-dct.cc +EXTRA_DIST = \ + CMakeLists.txt + +all: all-am + +.SUFFIXES: +.SUFFIXES: .cc .lo .o .obj +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu libde265/x86/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu libde265/x86/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm -f $${locs}; \ + rm -f $${locs}; \ + } + +libde265_x86.la: $(libde265_x86_la_OBJECTS) $(libde265_x86_la_DEPENDENCIES) $(EXTRA_libde265_x86_la_DEPENDENCIES) + $(AM_V_CXXLD)$(libde265_x86_la_LINK) $(libde265_x86_la_OBJECTS) $(libde265_x86_la_LIBADD) $(LIBS) + +libde265_x86_sse.la: $(libde265_x86_sse_la_OBJECTS) $(libde265_x86_sse_la_DEPENDENCIES) $(EXTRA_libde265_x86_sse_la_DEPENDENCIES) + $(AM_V_CXXLD)$(libde265_x86_sse_la_LINK) $(libde265_x86_sse_la_OBJECTS) $(libde265_x86_sse_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +include ./$(DEPDIR)/libde265_x86_la-sse.Plo # am--include-marker +include ./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo # am--include-marker +include ./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo # am--include-marker + +$(am__depfiles_remade): + @$(MKDIR_P) $(@D) + @echo '# dummy' >$@-t && $(am__mv) $@-t $@ + +am--depfiles: $(am__depfiles_remade) + +.cc.o: + $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< + 
$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +# $(AM_V_CXX)source='$<' object='$@' libtool=no \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(CXXCOMPILE) -c -o $@ $< + +.cc.obj: + $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` + $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +# $(AM_V_CXX)source='$<' object='$@' libtool=no \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.cc.lo: + $(AM_V_CXX)$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< + $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +# $(AM_V_CXX)source='$<' object='$@' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(LTCXXCOMPILE) -c -o $@ $< + +libde265_x86_la-sse.lo: sse.cc + $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_la-sse.lo -MD -MP -MF $(DEPDIR)/libde265_x86_la-sse.Tpo -c -o libde265_x86_la-sse.lo `test -f 'sse.cc' || echo '$(srcdir)/'`sse.cc + $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_la-sse.Tpo $(DEPDIR)/libde265_x86_la-sse.Plo +# $(AM_V_CXX)source='sse.cc' object='libde265_x86_la-sse.lo' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_la-sse.lo `test -f 'sse.cc' || echo '$(srcdir)/'`sse.cc + +libde265_x86_sse_la-sse-motion.lo: sse-motion.cc + $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_sse_la-sse-motion.lo -MD 
-MP -MF $(DEPDIR)/libde265_x86_sse_la-sse-motion.Tpo -c -o libde265_x86_sse_la-sse-motion.lo `test -f 'sse-motion.cc' || echo '$(srcdir)/'`sse-motion.cc + $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_sse_la-sse-motion.Tpo $(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo +# $(AM_V_CXX)source='sse-motion.cc' object='libde265_x86_sse_la-sse-motion.lo' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_sse_la-sse-motion.lo `test -f 'sse-motion.cc' || echo '$(srcdir)/'`sse-motion.cc + +libde265_x86_sse_la-sse-dct.lo: sse-dct.cc + $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_sse_la-sse-dct.lo -MD -MP -MF $(DEPDIR)/libde265_x86_sse_la-sse-dct.Tpo -c -o libde265_x86_sse_la-sse-dct.lo `test -f 'sse-dct.cc' || echo '$(srcdir)/'`sse-dct.cc + $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_sse_la-sse-dct.Tpo $(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo +# $(AM_V_CXX)source='sse-dct.cc' object='libde265_x86_sse_la-sse-dct.lo' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_sse_la-sse-dct.lo `test -f 'sse-dct.cc' || echo '$(srcdir)/'`sse-dct.cc + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + 
$(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(BUILT_SOURCES) + $(MAKE) $(AM_MAKEFLAGS) distdir-am + +distdir-am: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -f ./$(DEPDIR)/libde265_x86_la-sse.Plo + -rm -f ./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo + -rm -f ./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f ./$(DEPDIR)/libde265_x86_la-sse.Plo + -rm -f ./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo + -rm -f ./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \ + clean-generic clean-libtool clean-noinstLTLIBRARIES \ + cscopelist-am ctags ctags-am distclean distclean-compile \ + distclean-generic distclean-libtool distclean-tags distdir dvi \ + dvi-am html html-am info info-am install install-am \ + install-data install-data-am install-dvi install-dvi-am \ + install-exec install-exec-am install-html install-html-am \ + install-info install-info-am install-man install-pdf \ + install-pdf-am install-ps install-ps-am install-strip \ + installcheck installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool 
pdf pdf-am ps ps-am \ + tags tags-am uninstall uninstall-am + +.PRECIOUS: Makefile + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/x86/Makefile.am b/x86/Makefile.am new file mode 100644 index 0000000..1f0a33a --- /dev/null +++ b/x86/Makefile.am @@ -0,0 +1,22 @@ +noinst_LTLIBRARIES = libde265_x86.la libde265_x86_sse.la + +libde265_x86_la_CXXFLAGS = -I$(top_srcdir)/libde265 $(CFLAG_VISIBILITY) +libde265_x86_la_SOURCES = sse.cc sse.h +libde265_x86_la_LIBADD = libde265_x86_sse.la + +if HAVE_VISIBILITY + libde265_x86_la_CXXFLAGS += -DHAVE_VISIBILITY +endif + + +# SSE4 specific functions + +libde265_x86_sse_la_CXXFLAGS = -msse4.1 -I$(top_srcdir) -I$(top_srcdir)/libde265 $(CFLAG_VISIBILITY) +libde265_x86_sse_la_SOURCES = sse-motion.cc sse-motion.h sse-dct.h sse-dct.cc + +if HAVE_VISIBILITY + libde265_x86_sse_la_CXXFLAGS += -DHAVE_VISIBILITY +endif + +EXTRA_DIST = \ + CMakeLists.txt diff --git a/x86/Makefile.in b/x86/Makefile.in new file mode 100644 index 0000000..eb494fe --- /dev/null +++ b/x86/Makefile.in @@ -0,0 +1,703 @@ +# Makefile.in generated by automake 1.16.1 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994-2018 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. 
+ +@SET_MAKE@ + +VPATH = @srcdir@ +am__is_gnu_make = { \ + if test -z '$(MAKELEVEL)'; then \ + false; \ + elif test -n '$(MAKE_HOST)'; then \ + true; \ + elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ + true; \ + else \ + false; \ + fi; \ +} +am__make_running_with_option = \ + case $${target_option-} in \ + ?) ;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = 
: +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +target_triplet = @target@ +@HAVE_VISIBILITY_TRUE@am__append_1 = -DHAVE_VISIBILITY +@HAVE_VISIBILITY_TRUE@am__append_2 = -DHAVE_VISIBILITY +subdir = libde265/x86 +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/ax_compare_version.m4 \ + $(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \ + $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ + $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ + $(top_srcdir)/m4/lt~obsolete.m4 \ + $(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) +libde265_x86_la_DEPENDENCIES = libde265_x86_sse.la +am_libde265_x86_la_OBJECTS = libde265_x86_la-sse.lo +libde265_x86_la_OBJECTS = $(am_libde265_x86_la_OBJECTS) +AM_V_lt = $(am__v_lt_@AM_V@) +am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) +am__v_lt_0 = --silent +am__v_lt_1 = +libde265_x86_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ + $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +libde265_x86_sse_la_LIBADD = +am_libde265_x86_sse_la_OBJECTS = libde265_x86_sse_la-sse-motion.lo \ + libde265_x86_sse_la-sse-dct.lo +libde265_x86_sse_la_OBJECTS = $(am_libde265_x86_sse_la_OBJECTS) +libde265_x86_sse_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ + $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = 
$(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__maybe_remake_depfiles = depfiles +am__depfiles_remade = ./$(DEPDIR)/libde265_x86_la-sse.Plo \ + ./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo \ + ./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo +am__mv = mv -f +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CXXFLAGS) $(CXXFLAGS) +AM_V_CXX = $(am__v_CXX_@AM_V@) +am__v_CXX_ = $(am__v_CXX_@AM_DEFAULT_V@) +am__v_CXX_0 = @echo " CXX " $@; +am__v_CXX_1 = +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ + $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CXXLD = $(am__v_CXXLD_@AM_V@) +am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@) +am__v_CXXLD_0 = @echo " CXXLD " $@; +am__v_CXXLD_1 = +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_@AM_V@) +am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_@AM_V@) +am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +SOURCES = 
$(libde265_x86_la_SOURCES) $(libde265_x86_sse_la_SOURCES) +DIST_SOURCES = $(libde265_x86_la_SOURCES) \ + $(libde265_x86_sse_la_SOURCES) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. +am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +ALLOCA = @ALLOCA@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCAS = @CCAS@ +CCASDEPMODE = @CCASDEPMODE@ +CCASFLAGS = @CCASFLAGS@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +FGREP = @FGREP@ +GREP = @GREP@ +HAVE_CXX11 = @HAVE_CXX11@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ 
+INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LIBDE265_AGE = @LIBDE265_AGE@ +LIBDE265_CURRENT = @LIBDE265_CURRENT@ +LIBDE265_REVISION = @LIBDE265_REVISION@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MKDIR_P = @MKDIR_P@ +NM = @NM@ +NMEDIT = @NMEDIT@ +NUMERIC_VERSION = @NUMERIC_VERSION@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PKG_CONFIG = @PKG_CONFIG@ +PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ +PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ +QTCHOOSER = @QTCHOOSER@ +QTMOC = @QTMOC@ +QT_CFLAGS = @QT_CFLAGS@ +QT_LIBS = @QT_LIBS@ +RANLIB = @RANLIB@ +SDL_CFLAGS = @SDL_CFLAGS@ +SDL_LIBS = @SDL_LIBS@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +SWSCALE_CFLAGS = @SWSCALE_CFLAGS@ +SWSCALE_LIBS = @SWSCALE_LIBS@ +VERSION = @VERSION@ +VIDEOGFX_CFLAGS = @VIDEOGFX_CFLAGS@ +VIDEOGFX_LIBS = @VIDEOGFX_LIBS@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ 
+exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target = @target@ +target_alias = @target_alias@ +target_cpu = @target_cpu@ +target_os = @target_os@ +target_vendor = @target_vendor@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +noinst_LTLIBRARIES = libde265_x86.la libde265_x86_sse.la +libde265_x86_la_CXXFLAGS = -I$(top_srcdir)/libde265 \ + $(CFLAG_VISIBILITY) $(am__append_1) +libde265_x86_la_SOURCES = sse.cc sse.h +libde265_x86_la_LIBADD = libde265_x86_sse.la + +# SSE4 specific functions +libde265_x86_sse_la_CXXFLAGS = -msse4.1 -I$(top_srcdir) \ + -I$(top_srcdir)/libde265 $(CFLAG_VISIBILITY) $(am__append_2) +libde265_x86_sse_la_SOURCES = sse-motion.cc sse-motion.h sse-dct.h sse-dct.cc +EXTRA_DIST = \ + CMakeLists.txt + +all: all-am + +.SUFFIXES: +.SUFFIXES: .cc .lo .o .obj +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu libde265/x86/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu libde265/x86/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm -f $${locs}; \ + rm -f $${locs}; \ + } + +libde265_x86.la: $(libde265_x86_la_OBJECTS) $(libde265_x86_la_DEPENDENCIES) $(EXTRA_libde265_x86_la_DEPENDENCIES) + $(AM_V_CXXLD)$(libde265_x86_la_LINK) $(libde265_x86_la_OBJECTS) $(libde265_x86_la_LIBADD) $(LIBS) + +libde265_x86_sse.la: $(libde265_x86_sse_la_OBJECTS) $(libde265_x86_sse_la_DEPENDENCIES) $(EXTRA_libde265_x86_sse_la_DEPENDENCIES) + $(AM_V_CXXLD)$(libde265_x86_sse_la_LINK) $(libde265_x86_sse_la_OBJECTS) $(libde265_x86_sse_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_x86_la-sse.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo@am__quote@ # am--include-marker + +$(am__depfiles_remade): + @$(MKDIR_P) $(@D) + @echo '# dummy' >$@-t && $(am__mv) $@-t $@ + 
+am--depfiles: $(am__depfiles_remade) + +.cc.o: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $< + +.cc.obj: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.cc.lo: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $< + +libde265_x86_la-sse.lo: sse.cc +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_la-sse.lo -MD -MP -MF $(DEPDIR)/libde265_x86_la-sse.Tpo -c -o libde265_x86_la-sse.lo `test -f 'sse.cc' || echo '$(srcdir)/'`sse.cc +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_la-sse.Tpo $(DEPDIR)/libde265_x86_la-sse.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ 
$(AM_V_CXX)source='sse.cc' object='libde265_x86_la-sse.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_la-sse.lo `test -f 'sse.cc' || echo '$(srcdir)/'`sse.cc + +libde265_x86_sse_la-sse-motion.lo: sse-motion.cc +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_sse_la-sse-motion.lo -MD -MP -MF $(DEPDIR)/libde265_x86_sse_la-sse-motion.Tpo -c -o libde265_x86_sse_la-sse-motion.lo `test -f 'sse-motion.cc' || echo '$(srcdir)/'`sse-motion.cc +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_sse_la-sse-motion.Tpo $(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='sse-motion.cc' object='libde265_x86_sse_la-sse-motion.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_sse_la-sse-motion.lo `test -f 'sse-motion.cc' || echo '$(srcdir)/'`sse-motion.cc + +libde265_x86_sse_la-sse-dct.lo: sse-dct.cc +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -MT 
libde265_x86_sse_la-sse-dct.lo -MD -MP -MF $(DEPDIR)/libde265_x86_sse_la-sse-dct.Tpo -c -o libde265_x86_sse_la-sse-dct.lo `test -f 'sse-dct.cc' || echo '$(srcdir)/'`sse-dct.cc +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_sse_la-sse-dct.Tpo $(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='sse-dct.cc' object='libde265_x86_sse_la-sse-dct.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_sse_la-sse-dct.lo `test -f 'sse-dct.cc' || echo '$(srcdir)/'`sse-dct.cc + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" 
;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(BUILT_SOURCES) + $(MAKE) $(AM_MAKEFLAGS) distdir-am + +distdir-am: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -f ./$(DEPDIR)/libde265_x86_la-sse.Plo + -rm -f ./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo + -rm -f ./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f ./$(DEPDIR)/libde265_x86_la-sse.Plo + -rm -f ./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo + -rm -f ./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \ + clean-generic clean-libtool clean-noinstLTLIBRARIES \ + cscopelist-am ctags ctags-am distclean distclean-compile \ + distclean-generic distclean-libtool distclean-tags distdir dvi \ + dvi-am html html-am info info-am install install-am \ + install-data install-data-am install-dvi install-dvi-am \ + install-exec install-exec-am install-html install-html-am \ + install-info install-info-am install-man install-pdf \ + install-pdf-am install-ps install-ps-am install-strip \ + installcheck installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool 
pdf pdf-am ps ps-am \ + tags tags-am uninstall uninstall-am + +.PRECIOUS: Makefile + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/x86/sse-dct.cc b/x86/sse-dct.cc new file mode 100644 index 0000000..3a9b7ba --- /dev/null +++ b/x86/sse-dct.cc @@ -0,0 +1,7094 @@ +/* + * H.265 video codec. + * Copyright (c) 2013 openHEVC contributors + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see <http://www.gnu.org/licenses/>.
+ */ +#include "x86/sse-dct.h" +#include "libde265/util.h" + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <string.h> // memcpy + +#include <emmintrin.h> // SSE2 +#include <tmmintrin.h> // SSSE3 + +#if HAVE_SSE4_1 +#include <smmintrin.h> // SSE4.1 +#endif + + +ALIGNED_16(static const int16_t) transform4x4_luma[8][8] = +{ + { 29, +84, 29, +84, 29, +84, 29, +84 }, + { +74, +55, +74, +55, +74, +55, +74, +55 }, + { 55, -29, 55, -29, 55, -29, 55, -29 }, + { +74, -84, +74, -84, +74, -84, +74, -84 }, + { 74, -74, 74, -74, 74, -74, 74, -74 }, + { 0, +74, 0, +74, 0, +74, 0, +74 }, + { 84, +55, 84, +55, 84, +55, 84, +55 }, + { -74, -29, -74, -29, -74, -29, -74, -29 } +}; + +ALIGNED_16(static const int16_t) transform4x4[4][8] = { + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, -64, 64, -64, 64, -64, 64, -64 }, + { 83, 36, 83, 36, 83, 36, 83, 36 }, + { 36, -83, 36, -83, 36, -83, 36, -83 } +}; + +ALIGNED_16(static const int16_t) transform8x8[12][8] = +{ + { 89, 75, 89, 75, 89, 75, 89, 75 }, + { 50, 18, 50, 18, 50, 18, 50, 18 }, + { 75, -18, 75, -18, 75, -18, 75, -18 }, + { -89, -50, -89, -50,-89, -50,-89, -50 }, + { 50, -89, 50, -89, 50, -89, 50, -89 }, + { 18, 75, 18, 75, 18, 75, 18, 75 }, + { 18, -50, 18, -50, 18, -50, 18, -50 }, + { 75, -89, 75, -89, 75, -89, 75, -89 }, + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, -64, 64, -64, 64, -64, 64, -64 }, + { 83, 36, 83, 36, 83, 36, 83, 36 }, + { 36, -83, 36, -83, 36, -83, 36, -83 } +}; + +ALIGNED_16(static const int16_t) transform16x16_1[4][8][8] = +{ + {/*1-3*/ /*2-6*/ + { 90, 87, 90, 87, 90, 87, 90, 87 }, + { 87, 57, 87, 57, 87, 57, 87, 57 }, + { 80, 9, 80, 9, 80, 9, 80, 9 }, + { 70, -43, 70, -43, 70, -43, 70, -43 }, + { 57, -80, 57, -80, 57, -80, 57, -80 }, + { 43, -90, 43, -90, 43, -90, 43, -90 }, + { 25, -70, 25, -70, 25, -70, 25, -70 }, + { 9, -25, 9, -25, 9, -25, 9, -25 }, + },{ /*5-7*/ /*10-14*/ + { 80, 70, 80, 70, 80, 70, 80, 70 }, + { 9, -43, 9, -43, 9, -43, 9, -43 }, + { -70, -87, -70, -87, -70, -87, -70, -87 }, + { -87, 9, -87, 9, -87, 9, -87, 9 }, + { -25, 90, -25, 90, -25,
90, -25, 90 }, + { 57, 25, 57, 25, 57, 25, 57, 25 }, + { 90, -80, 90, -80, 90, -80, 90, -80 }, + { 43, -57, 43, -57, 43, -57, 43, -57 }, + },{ /*9-11*/ /*18-22*/ + { 57, 43, 57, 43, 57, 43, 57, 43 }, + { -80, -90, -80, -90, -80, -90, -80, -90 }, + { -25, 57, -25, 57, -25, 57, -25, 57 }, + { 90, 25, 90, 25, 90, 25, 90, 25 }, + { -9, -87, -9, -87, -9, -87, -9, -87 }, + { -87, 70, -87, 70, -87, 70, -87, 70 }, + { 43, 9, 43, 9, 43, 9, 43, 9 }, + { 70, -80, 70, -80, 70, -80, 70, -80 }, + },{/*13-15*/ /* 26-30 */ + { 25, 9, 25, 9, 25, 9, 25, 9 }, + { -70, -25, -70, -25, -70, -25, -70, -25 }, + { 90, 43, 90, 43, 90, 43, 90, 43 }, + { -80, -57, -80, -57, -80, -57, -80, -57 }, + { 43, 70, 43, 70, 43, 70, 43, 70 }, + { 9, -80, 9, -80, 9, -80, 9, -80 }, + { -57, 87, -57, 87, -57, 87, -57, 87 }, + { 87, -90, 87, -90, 87, -90, 87, -90 }, + } +}; + +ALIGNED_16(static const int16_t) transform16x16_2[2][4][8] = +{ + { /*2-6*/ /*4-12*/ + { 89, 75, 89, 75, 89, 75, 89, 75 }, + { 75, -18, 75, -18, 75, -18, 75, -18 }, + { 50, -89, 50, -89, 50, -89, 50, -89 }, + { 18, -50, 18, -50, 18, -50, 18, -50 }, + },{ /*10-14*/ /*20-28*/ + { 50, 18, 50, 18, 50, 18, 50, 18 }, + { -89, -50, -89, -50, -89, -50, -89, -50 }, + { 18, 75, 18, 75, 18, 75, 18, 75 }, + { 75, -89, 75, -89, 75, -89, 75, -89 }, + } +}; + +ALIGNED_16(static const int16_t) transform16x16_3[2][2][8] = +{ + {/*4-12*/ /*8-24*/ + { 83, 36, 83, 36, 83, 36, 83, 36 }, + { 36, -83, 36, -83, 36, -83, 36, -83 }, + },{ /*0-8*/ /*0-16*/ + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, -64, 64, -64, 64, -64, 64, -64 }, + } +}; + + +ALIGNED_16(static const int16_t) transform32x32[8][16][8] = +{ + { /* 1-3 */ + { 90, 90, 90, 90, 90, 90, 90, 90 }, + { 90, 82, 90, 82, 90, 82, 90, 82 }, + { 88, 67, 88, 67, 88, 67, 88, 67 }, + { 85, 46, 85, 46, 85, 46, 85, 46 }, + { 82, 22, 82, 22, 82, 22, 82, 22 }, + { 78, -4, 78, -4, 78, -4, 78, -4 }, + { 73, -31, 73, -31, 73, -31, 73, -31 }, + { 67, -54, 67, -54, 67, -54, 67, -54 }, + { 61, -73, 61, -73, 61, -73, 
61, -73 }, + { 54, -85, 54, -85, 54, -85, 54, -85 }, + { 46, -90, 46, -90, 46, -90, 46, -90 }, + { 38, -88, 38, -88, 38, -88, 38, -88 }, + { 31, -78, 31, -78, 31, -78, 31, -78 }, + { 22, -61, 22, -61, 22, -61, 22, -61 }, + { 13, -38, 13, -38, 13, -38, 13, -38 }, + { 4, -13, 4, -13, 4, -13, 4, -13 }, + },{/* 5-7 */ + { 88, 85, 88, 85, 88, 85, 88, 85 }, + { 67, 46, 67, 46, 67, 46, 67, 46 }, + { 31, -13, 31, -13, 31, -13, 31, -13 }, + { -13, -67, -13, -67, -13, -67, -13, -67 }, + { -54, -90, -54, -90, -54, -90, -54, -90 }, + { -82, -73, -82, -73, -82, -73, -82, -73 }, + { -90, -22, -90, -22, -90, -22, -90, -22 }, + { -78, 38, -78, 38, -78, 38, -78, 38 }, + { -46, 82, -46, 82, -46, 82, -46, 82 }, + { -4, 88, -4, 88, -4, 88, -4, 88 }, + { 38, 54, 38, 54, 38, 54, 38, 54 }, + { 73, -4, 73, -4, 73, -4, 73, -4 }, + { 90, -61, 90, -61, 90, -61, 90, -61 }, + { 85, -90, 85, -90, 85, -90, 85, -90 }, + { 61, -78, 61, -78, 61, -78, 61, -78 }, + { 22, -31, 22, -31, 22, -31, 22, -31 }, + },{/* 9-11 */ + { 82, 78, 82, 78, 82, 78, 82, 78 }, + { 22, -4, 22, -4, 22, -4, 22, -4 }, + { -54, -82, -54, -82, -54, -82, -54, -82 }, + { -90, -73, -90, -73, -90, -73, -90, -73 }, + { -61, 13, -61, 13, -61, 13, -61, 13 }, + { 13, 85, 13, 85, 13, 85, 13, 85 }, + { 78, 67, 78, 67, 78, 67, 78, 67 }, + { 85, -22, 85, -22, 85, -22, 85, -22 }, + { 31, -88, 31, -88, 31, -88, 31, -88 }, + { -46, -61, -46, -61, -46, -61, -46, -61 }, + { -90, 31, -90, 31, -90, 31, -90, 31 }, + { -67, 90, -67, 90, -67, 90, -67, 90 }, + { 4, 54, 4, 54, 4, 54, 4, 54 }, + { 73, -38, 73, -38, 73, -38, 73, -38 }, + { 88, -90, 88, -90, 88, -90, 88, -90 }, + { 38, -46, 38, -46, 38, -46, 38, -46 }, + },{/* 13-15 */ + { 73, 67, 73, 67, 73, 67, 73, 67 }, + { -31, -54, -31, -54, -31, -54, -31, -54 }, + { -90, -78, -90, -78, -90, -78, -90, -78 }, + { -22, 38, -22, 38, -22, 38, -22, 38 }, + { 78, 85, 78, 85, 78, 85, 78, 85 }, + { 67, -22, 67, -22, 67, -22, 67, -22 }, + { -38, -90, -38, -90, -38, -90, -38, -90 }, + { -90, 4, -90, 4, -90, 
4, -90, 4 }, + { -13, 90, -13, 90, -13, 90, -13, 90 }, + { 82, 13, 82, 13, 82, 13, 82, 13 }, + { 61, -88, 61, -88, 61, -88, 61, -88 }, + { -46, -31, -46, -31, -46, -31, -46, -31 }, + { -88, 82, -88, 82, -88, 82, -88, 82 }, + { -4, 46, -4, 46, -4, 46, -4, 46 }, + { 85, -73, 85, -73, 85, -73, 85, -73 }, + { 54, -61, 54, -61, 54, -61, 54, -61 }, + },{/* 17-19 */ + { 61, 54, 61, 54, 61, 54, 61, 54 }, + { -73, -85, -73, -85, -73, -85, -73, -85 }, + { -46, -4, -46, -4, -46, -4, -46, -4 }, + { 82, 88, 82, 88, 82, 88, 82, 88 }, + { 31, -46, 31, -46, 31, -46, 31, -46 }, + { -88, -61, -88, -61, -88, -61, -88, -61 }, + { -13, 82, -13, 82, -13, 82, -13, 82 }, + { 90, 13, 90, 13, 90, 13, 90, 13 }, + { -4, -90, -4, -90, -4, -90, -4, -90 }, + { -90, 38, -90, 38, -90, 38, -90, 38 }, + { 22, 67, 22, 67, 22, 67, 22, 67 }, + { 85, -78, 85, -78, 85, -78, 85, -78 }, + { -38, -22, -38, -22, -38, -22, -38, -22 }, + { -78, 90, -78, 90, -78, 90, -78, 90 }, + { 54, -31, 54, -31, 54, -31, 54, -31 }, + { 67, -73, 67, -73, 67, -73, 67, -73 }, + },{ /* 21-23 */ + { 46, 38, 46, 38, 46, 38, 46, 38 }, + { -90, -88, -90, -88, -90, -88, -90, -88 }, + { 38, 73, 38, 73, 38, 73, 38, 73 }, + { 54, -4, 54, -4, 54, -4, 54, -4 }, + { -90, -67, -90, -67, -90, -67, -90, -67 }, + { 31, 90, 31, 90, 31, 90, 31, 90 }, + { 61, -46, 61, -46, 61, -46, 61, -46 }, + { -88, -31, -88, -31, -88, -31, -88, -31 }, + { 22, 85, 22, 85, 22, 85, 22, 85 }, + { 67, -78, 67, -78, 67, -78, 67, -78 }, + { -85, 13, -85, 13, -85, 13, -85, 13 }, + { 13, 61, 13, 61, 13, 61, 13, 61 }, + { 73, -90, 73, -90, 73, -90, 73, -90 }, + { -82, 54, -82, 54, -82, 54, -82, 54 }, + { 4, 22, 4, 22, 4, 22, 4, 22 }, + { 78, -82, 78, -82, 78, -82, 78, -82 }, + },{ /* 25-27 */ + { 31, 22, 31, 22, 31, 22, 31, 22 }, + { -78, -61, -78, -61, -78, -61, -78, -61 }, + { 90, 85, 90, 85, 90, 85, 90, 85 }, + { -61, -90, -61, -90, -61, -90, -61, -90 }, + { 4, 73, 4, 73, 4, 73, 4, 73 }, + { 54, -38, 54, -38, 54, -38, 54, -38 }, + { -88, -4, -88, -4, -88, -4, -88, 
-4 }, + { 82, 46, 82, 46, 82, 46, 82, 46 }, + { -38, -78, -38, -78, -38, -78, -38, -78 }, + { -22, 90, -22, 90, -22, 90, -22, 90 }, + { 73, -82, 73, -82, 73, -82, 73, -82 }, + { -90, 54, -90, 54, -90, 54, -90, 54 }, + { 67, -13, 67, -13, 67, -13, 67, -13 }, + { -13, -31, -13, -31, -13, -31, -13, -31 }, + { -46, 67, -46, 67, -46, 67, -46, 67 }, + { 85, -88, 85, -88, 85, -88, 85, -88 }, + },{/* 29-31 */ + { 13, 4, 13, 4, 13, 4, 13, 4 }, + { -38, -13, -38, -13, -38, -13, -38, -13 }, + { 61, 22, 61, 22, 61, 22, 61, 22 }, + { -78, -31, -78, -31, -78, -31, -78, -31 }, + { 88, 38, 88, 38, 88, 38, 88, 38 }, + { -90, -46, -90, -46, -90, -46, -90, -46 }, + { 85, 54, 85, 54, 85, 54, 85, 54 }, + { -73, -61, -73, -61, -73, -61, -73, -61 }, + { 54, 67, 54, 67, 54, 67, 54, 67 }, + { -31, -73, -31, -73, -31, -73, -31, -73 }, + { 4, 78, 4, 78, 4, 78, 4, 78 }, + { 22, -82, 22, -82, 22, -82, 22, -82 }, + { -46, 85, -46, 85, -46, 85, -46, 85 }, + { 67, -88, 67, -88, 67, -88, 67, -88 }, + { -82, 90, -82, 90, -82, 90, -82, 90 }, + { 90, -90, 90, -90, 90, -90, 90, -90 }, + } +}; + +#define shift_1st 7 +#define add_1st (1 << (shift_1st - 1)) + + +void ff_hevc_transform_skip_8_sse(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride) +{ + uint8_t *dst = (uint8_t*)_dst; + ptrdiff_t stride = _stride; + int shift = 5; + int offset = 16; + __m128i r0,r1,r2,r3,r4,r5,r6,r9; + + r9= _mm_setzero_si128(); + //r8= _mm_set_epi32(0,0,0,-1); + r2= _mm_set1_epi16(offset); + + r0= _mm_load_si128((__m128i*)(coeffs)); + r1= _mm_load_si128((__m128i*)(coeffs+8)); + + + r0= _mm_adds_epi16(r0,r2); + r1= _mm_adds_epi16(r1,r2); + + r0= _mm_srai_epi16(r0,shift); + r1= _mm_srai_epi16(r1,shift); + + r3= _mm_loadl_epi64((__m128i*)(dst)); + r4= _mm_loadl_epi64((__m128i*)(dst + stride)); + r5= _mm_loadl_epi64((__m128i*)(dst + 2*stride)); + r6= _mm_loadl_epi64((__m128i*)(dst + 3*stride)); + + r3= _mm_unpacklo_epi8(r3,r9); + r4= _mm_unpacklo_epi8(r4,r9); + r5= _mm_unpacklo_epi8(r5,r9); + r6= _mm_unpacklo_epi8(r6,r9); 
+ r3= _mm_unpacklo_epi64(r3,r4); + r4= _mm_unpacklo_epi64(r5,r6); + + + r3= _mm_adds_epi16(r3,r0); + r4= _mm_adds_epi16(r4,r1); + + r3= _mm_packus_epi16(r3,r4); + //r8= _mm_set_epi32(0,0,0,-1); + + //_mm_maskmoveu_si128(r3,r8,(char *) (dst)); + *((uint32_t*)(dst)) = _mm_cvtsi128_si32(r3); + + r3= _mm_srli_si128(r3,4); + //_mm_maskmoveu_si128(r3,r8,(char *) (dst+stride)); + *((uint32_t*)(dst+stride)) = _mm_cvtsi128_si32(r3); + + r3= _mm_srli_si128(r3,4); + //_mm_maskmoveu_si128(r3,r8,(char *) (dst+2*stride)); + *((uint32_t*)(dst+2*stride)) = _mm_cvtsi128_si32(r3); + + r3= _mm_srli_si128(r3,4); + //_mm_maskmoveu_si128(r3,r8,(char *) (dst+3*stride)); + *((uint32_t*)(dst+3*stride)) = _mm_cvtsi128_si32(r3); +} + + + +#if HAVE_SSE4_1 +void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + + uint8_t shift_2nd = 12; // 20 - Bit depth + uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1)) + + uint8_t *dst = (uint8_t*) _dst; + ptrdiff_t stride = _stride; + const int16_t *src = coeffs; + __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA, + m128iD; + m128iAdd = _mm_set1_epi32(64); + + S0 = _mm_load_si128((__m128i *) (src)); + S8 = _mm_load_si128((__m128i *) (src + 8)); + + m128iAC = _mm_unpacklo_epi16(S0, S8); + m128iBD = _mm_unpackhi_epi16(S0, S8); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[0]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[1]))); + S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S0 = _mm_add_epi32(S0, m128iAdd); + S0 = _mm_srai_epi32(S0, shift_1st); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[2]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[3]))); + S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_add_epi32(S8, m128iAdd); + S8 = _mm_srai_epi32(S8, shift_1st); + + m128iA = _mm_packs_epi32(S0, S8); + + m128iTmp1 = 
_mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[4]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[5]))); + S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S0 = _mm_add_epi32(S0, m128iAdd); + S0 = _mm_srai_epi32(S0, shift_1st); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[6]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[7]))); + S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_add_epi32(S8, m128iAdd); + S8 = _mm_srai_epi32(S8, shift_1st); + + m128iD = _mm_packs_epi32(S0, S8); + + S0 = _mm_unpacklo_epi16(m128iA, m128iD); + S8 = _mm_unpackhi_epi16(m128iA, m128iD); + + m128iA = _mm_unpacklo_epi16(S0, S8); + m128iD = _mm_unpackhi_epi16(S0, S8); + + /* ################### */ + m128iAdd = _mm_set1_epi32(add_2nd); + + m128iAC = _mm_unpacklo_epi16(m128iA, m128iD); + m128iBD = _mm_unpackhi_epi16(m128iA, m128iD); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[0]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[1]))); + S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S0 = _mm_add_epi32(S0, m128iAdd); + S0 = _mm_srai_epi32(S0, shift_2nd); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[2]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[3]))); + S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_add_epi32(S8, m128iAdd); + S8 = _mm_srai_epi32(S8, shift_2nd); + + m128iA = _mm_packs_epi32(S0, S8); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[4]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[5]))); + S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S0 = _mm_add_epi32(S0, m128iAdd); + S0 = _mm_srai_epi32(S0, shift_2nd); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) 
(transform4x4_luma[6]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[7]))); + S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_add_epi32(S8, m128iAdd); + S8 = _mm_srai_epi32(S8, shift_2nd); + + m128iD = _mm_packs_epi32(S0, S8); + +// _mm_storeu_si128((__m128i *) (src), m128iA); +// _mm_storeu_si128((__m128i *) (src + 8), m128iD); + + S0 = _mm_move_epi64(m128iA); //contains row 0 + S8 = _mm_move_epi64(m128iD); //row 2 + m128iA = _mm_srli_si128(m128iA, 8); // row 1 + m128iD = _mm_srli_si128(m128iD, 8); // row 3 + m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA); + m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD); + S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2); + + //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1); //mask to store 4 * 8bit data + + m128iA = _mm_loadl_epi64((__m128i *) dst); + m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); + m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values + m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); + //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); + *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); + + dst += stride; + + m128iA = _mm_loadl_epi64((__m128i *) dst); + m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); + m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA); + m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); + //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); + *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); + + dst += stride; + + m128iA = _mm_loadl_epi64((__m128i *) dst); + m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); + m128iTmp1 = _mm_adds_epi16(S8, m128iA); + m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); + //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); + *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); + + dst += stride; + + m128iA = _mm_loadl_epi64((__m128i *) dst); + m128iA = _mm_unpacklo_epi8(m128iA, 
_mm_setzero_si128()); + m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA); + m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); + //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); + *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); +} +#endif // SSE4.1 + +#if 0 +void ff_hevc_transform_4x4_luma_add_10_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + int i,j; + uint8_t shift_2nd = 10; // 20 - Bit depth + uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1)) + + uint16_t *dst = (uint16_t*) _dst; + ptrdiff_t stride = _stride/(sizeof(uint16_t)); + int16_t *src = coeffs; + __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA, + m128iD; + + m128iAdd = _mm_set1_epi32(64); + + S0 = _mm_loadu_si128((__m128i *) (src)); + S8 = _mm_loadu_si128((__m128i *) (src + 8)); + + m128iAC = _mm_unpacklo_epi16(S0, S8); + m128iBD = _mm_unpackhi_epi16(S0, S8); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_loadu_si128((__m128i *) (transform4x4_luma[0]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_loadu_si128((__m128i *) (transform4x4_luma[1]))); + S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S0 = _mm_add_epi32(S0, m128iAdd); + S0 = _mm_srai_epi32(S0, shift_1st); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_loadu_si128((__m128i *) (transform4x4_luma[2]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_loadu_si128((__m128i *) (transform4x4_luma[3]))); + S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_add_epi32(S8, m128iAdd); + S8 = _mm_srai_epi32(S8, shift_1st); + + m128iA = _mm_packs_epi32(S0, S8); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_loadu_si128((__m128i *) (transform4x4_luma[4]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_loadu_si128((__m128i *) (transform4x4_luma[5]))); + S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S0 = _mm_add_epi32(S0, m128iAdd); + S0 = _mm_srai_epi32(S0, shift_1st); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_loadu_si128((__m128i *) (transform4x4_luma[6]))); + m128iTmp2 = 
_mm_madd_epi16(m128iBD, + _mm_loadu_si128((__m128i *) (transform4x4_luma[7]))); + S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_add_epi32(S8, m128iAdd); + S8 = _mm_srai_epi32(S8, shift_1st); + + m128iD = _mm_packs_epi32(S0, S8); + + S0 = _mm_unpacklo_epi16(m128iA, m128iD); + S8 = _mm_unpackhi_epi16(m128iA, m128iD); + + m128iA = _mm_unpacklo_epi16(S0, S8); + m128iD = _mm_unpackhi_epi16(S0, S8); + + /* ################### */ + m128iAdd = _mm_set1_epi32(add_2nd); + + m128iAC = _mm_unpacklo_epi16(m128iA, m128iD); + m128iBD = _mm_unpackhi_epi16(m128iA, m128iD); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[0]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[1]))); + S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S0 = _mm_add_epi32(S0, m128iAdd); + S0 = _mm_srai_epi32(S0, shift_2nd); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[2]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[3]))); + S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_add_epi32(S8, m128iAdd); + S8 = _mm_srai_epi32(S8, shift_2nd); + + m128iA = _mm_packs_epi32(S0, S8); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[4]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[5]))); + S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S0 = _mm_add_epi32(S0, m128iAdd); + S0 = _mm_srai_epi32(S0, shift_2nd); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[6]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[7]))); + S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_add_epi32(S8, m128iAdd); + S8 = _mm_srai_epi32(S8, shift_2nd); + + m128iD = _mm_packs_epi32(S0, S8); + + _mm_storeu_si128((__m128i *) (src), m128iA); + _mm_storeu_si128((__m128i *) (src + 8), m128iD); + j = 0; + for (i = 0; i < 2; 
i++) { + dst[0] = av_clip_uintp2(dst[0] + src[j],10); + dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10); + dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10); + dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10); + j += 1; + dst += stride; + dst[0] = av_clip_uintp2(dst[0] + src[j],10); + dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10); + dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10); + dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10); + j += 1; + dst += stride; + } + +} +#endif + + +#if HAVE_SSE4_1 +void ff_hevc_transform_4x4_add_8_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + uint8_t shift_2nd = 12; // 20 - Bit depth + uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1)) + + uint8_t *dst = (uint8_t*) _dst; + ptrdiff_t stride = _stride; + const int16_t *src = coeffs; + + __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD, m128iTmp1,m128iTmp2; + S0 = _mm_load_si128((__m128i *) (src)); + S8 = _mm_load_si128((__m128i *) (src + 8)); + m128iAdd = _mm_set1_epi32(add_1st); + + m128Tmp = _mm_unpacklo_epi16(S0, S8); + E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0]))); + E1 = _mm_add_epi32(E1, m128iAdd); + + E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1]))); + E2 = _mm_add_epi32(E2, m128iAdd); + + m128Tmp = _mm_unpackhi_epi16(S0, S8); + O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2]))); + O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3]))); + + m128iA = _mm_add_epi32(E1, O1); + m128iA = _mm_srai_epi32(m128iA, shift_1st); // Sum = Sum >> iShiftNum + m128Tmp = _mm_add_epi32(E2, O2); + m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum + m128iA = _mm_packs_epi32(m128iA, m128Tmp); + + m128iD = _mm_sub_epi32(E2, O2); + m128iD = _mm_srai_epi32(m128iD, shift_1st); // Sum = Sum >> iShiftNum + + m128Tmp = _mm_sub_epi32(E1, O1); + m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum + + m128iD = 
_mm_packs_epi32(m128iD, m128Tmp); + + S0 = _mm_unpacklo_epi16(m128iA, m128iD); + S8 = _mm_unpackhi_epi16(m128iA, m128iD); + + m128iA = _mm_unpacklo_epi16(S0, S8); + m128iD = _mm_unpackhi_epi16(S0, S8); + + /* ########################## */ + + m128iAdd = _mm_set1_epi32(add_2nd); + m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD); + E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0]))); + E1 = _mm_add_epi32(E1, m128iAdd); + + E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1]))); + E2 = _mm_add_epi32(E2, m128iAdd); + + m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD); + O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2]))); + O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3]))); + + m128iA = _mm_add_epi32(E1, O1); + m128iA = _mm_srai_epi32(m128iA, shift_2nd); + m128Tmp = _mm_add_epi32(E2, O2); + m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd); + m128iA = _mm_packs_epi32(m128iA, m128Tmp); + + m128iD = _mm_sub_epi32(E2, O2); + m128iD = _mm_srai_epi32(m128iD, shift_2nd); + + m128Tmp = _mm_sub_epi32(E1, O1); + m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd); + + m128iD = _mm_packs_epi32(m128iD, m128Tmp); + + S0 = _mm_move_epi64(m128iA); //contains row 0 + S8 = _mm_move_epi64(m128iD); //row 2 + m128iA = _mm_srli_si128(m128iA, 8); // row 1 + m128iD = _mm_srli_si128(m128iD, 8); // row 3 + m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA); + m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD); + S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2); + + //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1); //mask to store 4 * 8bit data + + m128iA = _mm_loadl_epi64((__m128i *) dst); + m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); + m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values + m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); + //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); + *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); + + dst 
+= stride; + + m128iA = _mm_loadl_epi64((__m128i *) dst); + m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); + m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA); + m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); + //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); + *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); + + dst += stride; + + m128iA = _mm_loadl_epi64((__m128i *) dst); + m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); + m128iTmp1 = _mm_adds_epi16(S8, m128iA); + m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); + //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); + *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); + + dst += stride; + + m128iA = _mm_loadl_epi64((__m128i *) dst); + m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); + m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA); + m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); + //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); + *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); +} +#endif + +#if 0 +void ff_hevc_transform_4x4_add_10_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + int i; + uint8_t shift_2nd = 10; // 20 - Bit depth + uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1)) + + uint16_t *dst = (uint16_t*) _dst; + ptrdiff_t stride = _stride/2; + int16_t *src = coeffs; + + int j; + __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD; + S0 = _mm_load_si128((__m128i *) (src)); + S8 = _mm_load_si128((__m128i *) (src + 8)); + m128iAdd = _mm_set1_epi32(add_1st); + + m128Tmp = _mm_unpacklo_epi16(S0, S8); + E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0]))); + E1 = _mm_add_epi32(E1, m128iAdd); + + E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1]))); + E2 = _mm_add_epi32(E2, m128iAdd); + + m128Tmp = _mm_unpackhi_epi16(S0, S8); + O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2]))); + O2 = 
_mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3]))); + + m128iA = _mm_add_epi32(E1, O1); + m128iA = _mm_srai_epi32(m128iA, shift_1st); // Sum = Sum >> iShiftNum + m128Tmp = _mm_add_epi32(E2, O2); + m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum + m128iA = _mm_packs_epi32(m128iA, m128Tmp); + + m128iD = _mm_sub_epi32(E2, O2); + m128iD = _mm_srai_epi32(m128iD, shift_1st); // Sum = Sum >> iShiftNum + + m128Tmp = _mm_sub_epi32(E1, O1); + m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum + + m128iD = _mm_packs_epi32(m128iD, m128Tmp); + + S0 = _mm_unpacklo_epi16(m128iA, m128iD); + S8 = _mm_unpackhi_epi16(m128iA, m128iD); + + m128iA = _mm_unpacklo_epi16(S0, S8); + m128iD = _mm_unpackhi_epi16(S0, S8); + + /* ########################## */ + + m128iAdd = _mm_set1_epi32(add_2nd); + m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD); + E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0]))); + E1 = _mm_add_epi32(E1, m128iAdd); + + E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1]))); + E2 = _mm_add_epi32(E2, m128iAdd); + + m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD); + O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2]))); + O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3]))); + + m128iA = _mm_add_epi32(E1, O1); + m128iA = _mm_srai_epi32(m128iA, shift_2nd); + m128Tmp = _mm_add_epi32(E2, O2); + m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd); + m128iA = _mm_packs_epi32(m128iA, m128Tmp); + + m128iD = _mm_sub_epi32(E2, O2); + m128iD = _mm_srai_epi32(m128iD, shift_2nd); + + m128Tmp = _mm_sub_epi32(E1, O1); + m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd); + + m128iD = _mm_packs_epi32(m128iD, m128Tmp); + _mm_storeu_si128((__m128i *) (src), m128iA); + _mm_storeu_si128((__m128i *) (src + 8), m128iD); + j = 0; + for (i = 0; i < 2; i++) { + dst[0] = av_clip_uintp2(dst[0] + src[j],10); + dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10); + dst[2] 
= av_clip_uintp2(dst[2] + src[j + 8],10); + dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10); + j += 1; + dst += stride; + dst[0] = av_clip_uintp2(dst[0] + src[j],10); + dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10); + dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10); + dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10); + j += 1; + dst += stride; + } +} +#endif + +#if HAVE_SSE4_1 +void ff_hevc_transform_8x8_add_8_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + uint8_t shift_2nd = 12; // 20 - Bit depth + uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1)) + + uint8_t *dst = (uint8_t*) _dst; + ptrdiff_t stride = _stride / sizeof(uint8_t); + const int16_t *src = coeffs; + __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, + m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, + E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, + + O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h, + T0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11; + T0= _mm_load_si128((__m128i *) (transform8x8[0])); + T1= _mm_load_si128((__m128i *) (transform8x8[1])); + T2= _mm_load_si128((__m128i *) (transform8x8[2])); + T3= _mm_load_si128((__m128i *) (transform8x8[3])); + T4= _mm_load_si128((__m128i *) (transform8x8[4])); + T5= _mm_load_si128((__m128i *) (transform8x8[5])); + T6= _mm_load_si128((__m128i *) (transform8x8[6])); + T7= _mm_load_si128((__m128i *) (transform8x8[7])); + T8= _mm_load_si128((__m128i *) (transform8x8[8])); + T9= _mm_load_si128((__m128i *) (transform8x8[9])); + T10= _mm_load_si128((__m128i *) (transform8x8[10])); + T11= _mm_load_si128((__m128i *) (transform8x8[11])); + + m128iAdd = _mm_set1_epi32(add_1st); + + m128iS1 = _mm_load_si128((__m128i *) (src + 8)); + m128iS3 = _mm_load_si128((__m128i *) (src + 24)); + m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); + E1l = _mm_madd_epi16(m128Tmp0, T0); + m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); + E1h = _mm_madd_epi16(m128Tmp1, T0); + m128iS5 = 
_mm_load_si128((__m128i *) (src + 40)); + m128iS7 = _mm_load_si128((__m128i *) (src + 56)); + m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); + E2l = _mm_madd_epi16(m128Tmp2, T1); + m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); + E2h = _mm_madd_epi16(m128Tmp3, T1); + O0l = _mm_add_epi32(E1l, E2l); + O0h = _mm_add_epi32(E1h, E2h); + + E1l = _mm_madd_epi16(m128Tmp0, T2); + E1h = _mm_madd_epi16(m128Tmp1, T2); + E2l = _mm_madd_epi16(m128Tmp2, T3); + E2h = _mm_madd_epi16(m128Tmp3, T3); + + O1l = _mm_add_epi32(E1l, E2l); + O1h = _mm_add_epi32(E1h, E2h); + + E1l = _mm_madd_epi16(m128Tmp0, T4); + E1h = _mm_madd_epi16(m128Tmp1, T4); + E2l = _mm_madd_epi16(m128Tmp2, T5); + E2h = _mm_madd_epi16(m128Tmp3, T5); + O2l = _mm_add_epi32(E1l, E2l); + O2h = _mm_add_epi32(E1h, E2h); + + E1l = _mm_madd_epi16(m128Tmp0, T6); + E1h = _mm_madd_epi16(m128Tmp1, T6); + E2l = _mm_madd_epi16(m128Tmp2, T7); + E2h = _mm_madd_epi16(m128Tmp3, T7); + O3h = _mm_add_epi32(E1h, E2h); + O3l = _mm_add_epi32(E1l, E2l); + + /* ------- */ + + m128iS0 = _mm_load_si128((__m128i *) (src + 0)); + m128iS4 = _mm_load_si128((__m128i *) (src + 32)); + m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4); + EE0l = _mm_madd_epi16(m128Tmp0, T8); + m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4); + EE0h = _mm_madd_epi16(m128Tmp1, T8); + + EE1l = _mm_madd_epi16(m128Tmp0, T9); + EE1h = _mm_madd_epi16(m128Tmp1, T9); + + /* ------- */ + + m128iS2 = _mm_load_si128((__m128i *) (src + 16)); + m128iS6 = _mm_load_si128((__m128i *) (src + 48)); + m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); + E00l = _mm_madd_epi16(m128Tmp0, T10); + m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); + E00h = _mm_madd_epi16(m128Tmp1, T10); + E01l = _mm_madd_epi16(m128Tmp0, T11); + E01h = _mm_madd_epi16(m128Tmp1, T11); + E0l = _mm_add_epi32(EE0l, E00l); + E0l = _mm_add_epi32(E0l, m128iAdd); + E0h = _mm_add_epi32(EE0h, E00h); + E0h = _mm_add_epi32(E0h, m128iAdd); + E3l = _mm_sub_epi32(EE0l, E00l); + E3l = _mm_add_epi32(E3l, m128iAdd); + E3h = 
_mm_sub_epi32(EE0h, E00h); + E3h = _mm_add_epi32(E3h, m128iAdd); + + E1l = _mm_add_epi32(EE1l, E01l); + E1l = _mm_add_epi32(E1l, m128iAdd); + E1h = _mm_add_epi32(EE1h, E01h); + E1h = _mm_add_epi32(E1h, m128iAdd); + E2l = _mm_sub_epi32(EE1l, E01l); + E2l = _mm_add_epi32(E2l, m128iAdd); + E2h = _mm_sub_epi32(EE1h, E01h); + E2h = _mm_add_epi32(E2h, m128iAdd); + m128iS0 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st), + _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st)); + m128iS1 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st), + _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st)); + m128iS2 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st), + _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st)); + m128iS3 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st), + _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st)); + m128iS4 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st), + _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st)); + m128iS5 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st), + _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st)); + m128iS6 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st), + _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st)); + m128iS7 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st), + _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st)); + /* Invers matrix */ + + E0l = _mm_unpacklo_epi16(m128iS0, m128iS4); + E1l = _mm_unpacklo_epi16(m128iS1, m128iS5); + E2l = _mm_unpacklo_epi16(m128iS2, m128iS6); + E3l = _mm_unpacklo_epi16(m128iS3, m128iS7); + O0l = _mm_unpackhi_epi16(m128iS0, m128iS4); + O1l = _mm_unpackhi_epi16(m128iS1, m128iS5); + O2l = _mm_unpackhi_epi16(m128iS2, m128iS6); + O3l = _mm_unpackhi_epi16(m128iS3, m128iS7); + m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l); + m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l); + m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); + 
m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); + m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l); + m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l); + m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); + m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); + m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l); + m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l); + m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); + m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); + m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l); + m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l); + m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); + m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); + + m128iAdd = _mm_set1_epi32(add_2nd); + + m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); + E1l = _mm_madd_epi16(m128Tmp0, T0); + m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); + E1h = _mm_madd_epi16(m128Tmp1, T0); + m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); + E2l = _mm_madd_epi16(m128Tmp2, T1); + m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); + E2h = _mm_madd_epi16(m128Tmp3, T1); + O0l = _mm_add_epi32(E1l, E2l); + O0h = _mm_add_epi32(E1h, E2h); + E1l = _mm_madd_epi16(m128Tmp0, T2); + E1h = _mm_madd_epi16(m128Tmp1, T2); + E2l = _mm_madd_epi16(m128Tmp2, T3); + E2h = _mm_madd_epi16(m128Tmp3, T3); + O1l = _mm_add_epi32(E1l, E2l); + O1h = _mm_add_epi32(E1h, E2h); + E1l = _mm_madd_epi16(m128Tmp0, T4); + E1h = _mm_madd_epi16(m128Tmp1, T4); + E2l = _mm_madd_epi16(m128Tmp2, T5); + E2h = _mm_madd_epi16(m128Tmp3, T5); + O2l = _mm_add_epi32(E1l, E2l); + O2h = _mm_add_epi32(E1h, E2h); + E1l = _mm_madd_epi16(m128Tmp0, T6); + E1h = _mm_madd_epi16(m128Tmp1, T6); + E2l = _mm_madd_epi16(m128Tmp2, T7); + E2h = _mm_madd_epi16(m128Tmp3, T7); + O3h = _mm_add_epi32(E1h, E2h); + O3l = _mm_add_epi32(E1l, E2l); + + m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4); + EE0l = _mm_madd_epi16(m128Tmp0, T8); + m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4); + EE0h = _mm_madd_epi16(m128Tmp1, T8); + EE1l = _mm_madd_epi16(m128Tmp0, T9); + EE1h = _mm_madd_epi16(m128Tmp1, T9); + 
+ m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); + E00l = _mm_madd_epi16(m128Tmp0, T10); + m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); + E00h = _mm_madd_epi16(m128Tmp1, T10); + E01l = _mm_madd_epi16(m128Tmp0, T11); + E01h = _mm_madd_epi16(m128Tmp1, T11); + E0l = _mm_add_epi32(EE0l, E00l); + E0l = _mm_add_epi32(E0l, m128iAdd); + E0h = _mm_add_epi32(EE0h, E00h); + E0h = _mm_add_epi32(E0h, m128iAdd); + E3l = _mm_sub_epi32(EE0l, E00l); + E3l = _mm_add_epi32(E3l, m128iAdd); + E3h = _mm_sub_epi32(EE0h, E00h); + E3h = _mm_add_epi32(E3h, m128iAdd); + E1l = _mm_add_epi32(EE1l, E01l); + E1l = _mm_add_epi32(E1l, m128iAdd); + E1h = _mm_add_epi32(EE1h, E01h); + E1h = _mm_add_epi32(E1h, m128iAdd); + E2l = _mm_sub_epi32(EE1l, E01l); + E2l = _mm_add_epi32(E2l, m128iAdd); + E2h = _mm_sub_epi32(EE1h, E01h); + E2h = _mm_add_epi32(E2h, m128iAdd); + + m128iS0 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd), + _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd)); + m128iS1 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd), + _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd)); + m128iS2 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd), + _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd)); + m128iS3 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd), + _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd)); + m128iS4 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd), + _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd)); + m128iS5 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd), + _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd)); + m128iS6 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd), + _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd)); + m128iS7 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd), + _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd)); + + E0l = _mm_unpacklo_epi16(m128iS0, m128iS4); 
+ E1l = _mm_unpacklo_epi16(m128iS1, m128iS5); + E2l = _mm_unpacklo_epi16(m128iS2, m128iS6); + E3l = _mm_unpacklo_epi16(m128iS3, m128iS7); + O0l = _mm_unpackhi_epi16(m128iS0, m128iS4); + O1l = _mm_unpackhi_epi16(m128iS1, m128iS5); + O2l = _mm_unpackhi_epi16(m128iS2, m128iS6); + O3l = _mm_unpackhi_epi16(m128iS3, m128iS7); + m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l); + m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l); + m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); + m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); + m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l); + m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l); + m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); + m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); + m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l); + m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l); + m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); + m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); + m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l); + m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l); + m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); + m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); + + E0l = _mm_loadl_epi64((__m128i *) dst); + E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); + + E0l = _mm_adds_epi16(E0l, m128iS0); + E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); + _mm_storel_epi64((__m128i *) dst, E0l); + dst += stride; + + E0l = _mm_loadl_epi64((__m128i *) dst); + E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); + + E0l = _mm_adds_epi16(E0l, m128iS1); + E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); + _mm_storel_epi64((__m128i *) dst, E0l); + dst += stride; + + E0l = _mm_loadl_epi64((__m128i *) dst); + E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); + + E0l = _mm_adds_epi16(E0l, m128iS2); + E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); + _mm_storel_epi64((__m128i *) dst, E0l); + dst += stride; + + E0l = _mm_loadl_epi64((__m128i *) dst); + E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); + + E0l = _mm_adds_epi16(E0l, m128iS3); + E0l = _mm_packus_epi16(E0l, 
_mm_setzero_si128()); + _mm_storel_epi64((__m128i *) dst, E0l); + dst += stride; + + E0l = _mm_loadl_epi64((__m128i *) dst); + E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); + + E0l = _mm_adds_epi16(E0l, m128iS4); + E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); + _mm_storel_epi64((__m128i *) dst, E0l); + dst += stride; + + E0l = _mm_loadl_epi64((__m128i *) dst); + E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); + + E0l = _mm_adds_epi16(E0l, m128iS5); + E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); + _mm_storel_epi64((__m128i *) dst, E0l); + dst += stride; + + E0l = _mm_loadl_epi64((__m128i *) dst); + E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); + + E0l = _mm_adds_epi16(E0l, m128iS6); + E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); + _mm_storel_epi64((__m128i *) dst, E0l); + dst += stride; + + E0l = _mm_loadl_epi64((__m128i *) dst); + E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); + + E0l = _mm_adds_epi16(E0l, m128iS7); + E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); + _mm_storel_epi64((__m128i *) dst, E0l); + dst += stride; + +} +#endif + +#if 0 +void ff_hevc_transform_8x8_add_10_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + int i; + uint16_t *dst = (uint16_t*) _dst; + ptrdiff_t stride = _stride / sizeof(uint16_t); + int16_t *src = coeffs; + uint8_t shift_2nd = 10; // 20 - Bit depth + uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1)) + + __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, + m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, + E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, + O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h; + int j; + m128iAdd = _mm_set1_epi32(add_1st); + + m128iS1 = _mm_load_si128((__m128i *) (src + 8)); + m128iS3 = _mm_load_si128((__m128i *) (src + 24)); + m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[0]))); + m128Tmp1 = 
_mm_unpackhi_epi16(m128iS1, m128iS3); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[0]))); + m128iS5 = _mm_load_si128((__m128i *) (src + 40)); + m128iS7 = _mm_load_si128((__m128i *) (src + 56)); + m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); + E2l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform8x8[1]))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); + E2h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform8x8[1]))); + O0l = _mm_add_epi32(E1l, E2l); + O0h = _mm_add_epi32(E1h, E2h); + + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[2]))); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[2]))); + E2l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform8x8[3]))); + E2h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform8x8[3]))); + + O1l = _mm_add_epi32(E1l, E2l); + O1h = _mm_add_epi32(E1h, E2h); + + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[4]))); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[4]))); + E2l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform8x8[5]))); + E2h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform8x8[5]))); + O2l = _mm_add_epi32(E1l, E2l); + O2h = _mm_add_epi32(E1h, E2h); + + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[6]))); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[6]))); + E2l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform8x8[7]))); + E2h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform8x8[7]))); + O3h = _mm_add_epi32(E1h, E2h); + O3l = _mm_add_epi32(E1l, E2l); + + /* ------- */ + + m128iS0 = _mm_load_si128((__m128i *) (src + 0)); + m128iS4 = _mm_load_si128((__m128i *) (src + 32)); + m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4); + EE0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) 
(transform8x8[8]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4); + EE0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[8]))); + + EE1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[9]))); + EE1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[9]))); + + /* ------- */ + + m128iS2 = _mm_load_si128((__m128i *) (src + 16)); + m128iS6 = _mm_load_si128((__m128i *) (src + 48)); + m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); + E00l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[10]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); + E00h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[10]))); + E01l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[11]))); + E01h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[11]))); + E0l = _mm_add_epi32(EE0l, E00l); + E0l = _mm_add_epi32(E0l, m128iAdd); + E0h = _mm_add_epi32(EE0h, E00h); + E0h = _mm_add_epi32(E0h, m128iAdd); + E3l = _mm_sub_epi32(EE0l, E00l); + E3l = _mm_add_epi32(E3l, m128iAdd); + E3h = _mm_sub_epi32(EE0h, E00h); + E3h = _mm_add_epi32(E3h, m128iAdd); + + E1l = _mm_add_epi32(EE1l, E01l); + E1l = _mm_add_epi32(E1l, m128iAdd); + E1h = _mm_add_epi32(EE1h, E01h); + E1h = _mm_add_epi32(E1h, m128iAdd); + E2l = _mm_sub_epi32(EE1l, E01l); + E2l = _mm_add_epi32(E2l, m128iAdd); + E2h = _mm_sub_epi32(EE1h, E01h); + E2h = _mm_add_epi32(E2h, m128iAdd); + m128iS0 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st), + _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st)); + m128iS1 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st), + _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st)); + m128iS2 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st), + _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st)); + m128iS3 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st), + 
_mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st)); + m128iS4 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st), + _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st)); + m128iS5 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st), + _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st)); + m128iS6 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st), + _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st)); + m128iS7 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st), + _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st)); + /* Invers matrix */ + + E0l = _mm_unpacklo_epi16(m128iS0, m128iS4); + E1l = _mm_unpacklo_epi16(m128iS1, m128iS5); + E2l = _mm_unpacklo_epi16(m128iS2, m128iS6); + E3l = _mm_unpacklo_epi16(m128iS3, m128iS7); + O0l = _mm_unpackhi_epi16(m128iS0, m128iS4); + O1l = _mm_unpackhi_epi16(m128iS1, m128iS5); + O2l = _mm_unpackhi_epi16(m128iS2, m128iS6); + O3l = _mm_unpackhi_epi16(m128iS3, m128iS7); + m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l); + m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l); + m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); + m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); + m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l); + m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l); + m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); + m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); + m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l); + m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l); + m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); + m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); + m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l); + m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l); + m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); + m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); + + m128iAdd = _mm_set1_epi32(add_2nd); + + m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); + E1h = 
_mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[0]))); + m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); + E2l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform8x8[1]))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); + E2h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform8x8[1]))); + O0l = _mm_add_epi32(E1l, E2l); + O0h = _mm_add_epi32(E1h, E2h); + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[2]))); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[2]))); + E2l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform8x8[3]))); + E2h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform8x8[3]))); + O1l = _mm_add_epi32(E1l, E2l); + O1h = _mm_add_epi32(E1h, E2h); + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[4]))); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[4]))); + E2l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform8x8[5]))); + E2h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform8x8[5]))); + O2l = _mm_add_epi32(E1l, E2l); + O2h = _mm_add_epi32(E1h, E2h); + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[6]))); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[6]))); + E2l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform8x8[7]))); + E2h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform8x8[7]))); + O3h = _mm_add_epi32(E1h, E2h); + O3l = _mm_add_epi32(E1l, E2l); + + m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4); + EE0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[8]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4); + EE0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[8]))); + EE1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[9]))); + EE1h = _mm_madd_epi16(m128Tmp1, + 
_mm_load_si128((__m128i *) (transform8x8[9]))); + + m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); + E00l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[10]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); + E00h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[10]))); + E01l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[11]))); + E01h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[11]))); + E0l = _mm_add_epi32(EE0l, E00l); + E0l = _mm_add_epi32(E0l, m128iAdd); + E0h = _mm_add_epi32(EE0h, E00h); + E0h = _mm_add_epi32(E0h, m128iAdd); + E3l = _mm_sub_epi32(EE0l, E00l); + E3l = _mm_add_epi32(E3l, m128iAdd); + E3h = _mm_sub_epi32(EE0h, E00h); + E3h = _mm_add_epi32(E3h, m128iAdd); + E1l = _mm_add_epi32(EE1l, E01l); + E1l = _mm_add_epi32(E1l, m128iAdd); + E1h = _mm_add_epi32(EE1h, E01h); + E1h = _mm_add_epi32(E1h, m128iAdd); + E2l = _mm_sub_epi32(EE1l, E01l); + E2l = _mm_add_epi32(E2l, m128iAdd); + E2h = _mm_sub_epi32(EE1h, E01h); + E2h = _mm_add_epi32(E2h, m128iAdd); + + m128iS0 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd), + _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd)); + m128iS1 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd), + _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd)); + m128iS2 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd), + _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd)); + m128iS3 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd), + _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd)); + m128iS4 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd), + _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd)); + m128iS5 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd), + _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd)); + m128iS6 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd), + 
_mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd)); + m128iS7 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd), + _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd)); + + _mm_store_si128((__m128i *) (src), m128iS0); + _mm_store_si128((__m128i *) (src + 8), m128iS1); + _mm_store_si128((__m128i *) (src + 16), m128iS2); + _mm_store_si128((__m128i *) (src + 24), m128iS3); + _mm_store_si128((__m128i *) (src + 32), m128iS4); + _mm_store_si128((__m128i *) (src + 40), m128iS5); + _mm_store_si128((__m128i *) (src + 48), m128iS6); + _mm_store_si128((__m128i *) (src + 56), m128iS7); + + j = 0; + for (i = 0; i < 4; i++) { + dst[0] = av_clip_uintp2(dst[0] + src[j],10); + dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10); + dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10); + dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10); + dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10); + dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10); + dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10); + dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10); + j += 1; + dst += stride; + dst[0] = av_clip_uintp2(dst[0] + src[j],10); + dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10); + dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10); + dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10); + dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10); + dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10); + dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10); + dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10); + j += 1; + dst += stride; + } + +} +#endif + + +#if HAVE_SSE4_1 +/* 16x16 transform-and-add for 8-bit samples. Reads the 16x16 int16_t block at coeffs, runs two passes over it (pass 1 with shift_1st/add_1st, pass 2 with shift_2nd/add_2nd), then adds the result to the 16x16 pixels at _dst with unsigned 8-bit saturation and stores them back, _stride bytes between rows. Destination rows are accessed with _mm_load_si128/_mm_store_si128, so _dst and _stride must keep every row 16-byte aligned; coeffs is read mostly with aligned loads as well. NOTE(review): presumably the HEVC inverse transform of a residual block - confirm against the caller. */ void ff_hevc_transform_16x16_add_8_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + uint8_t shift_2nd = 12; // 20 - Bit depth + uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1)) + int i; + uint8_t *dst = (uint8_t*) _dst; + ptrdiff_t stride = _stride / sizeof(uint8_t); + const int16_t *src = coeffs; + int32_t shift; + __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, + m128iS7, m128iS8, m128iS9,
m128iS10, m128iS11, m128iS12, m128iS13, + m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, + m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h, + E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h, + O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l, + E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h; + __m128i E4l, E5l, E6l, E7l; + __m128i E4h, E5h, E6h, E7h; + __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15; + __m128i r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31; + + + /*__m128i T00,T01, T02, T03, T04, T05, T06, T07; + __m128i T10,T11, T12, T13, T14, T15, T16, T17; + __m128i T20,T21, T22, T23, T24, T25, T26, T27; + __m128i T30,T31, T32, T33, T34, T35, T36, T37; + + __m128i U00,U01, U02, U03, U10, U11, U12, U13; + + __m128i V00,V01, V10, V11;*/ + + + const __m128i T00 = _mm_load_si128((__m128i *) (transform16x16_1[0][0])); + const __m128i T01 = _mm_load_si128((__m128i *) (transform16x16_1[0][1])); + const __m128i T02 = _mm_load_si128((__m128i *) (transform16x16_1[0][2])); + const __m128i T03 = _mm_load_si128((__m128i *) (transform16x16_1[0][3])); + const __m128i T04 = _mm_load_si128((__m128i *) (transform16x16_1[0][4])); + const __m128i T05 = _mm_load_si128((__m128i *) (transform16x16_1[0][5])); + const __m128i T06 = _mm_load_si128((__m128i *) (transform16x16_1[0][6])); + const __m128i T07 = _mm_load_si128((__m128i *) (transform16x16_1[0][7])); + const __m128i T10 = _mm_load_si128((__m128i *) (transform16x16_1[1][0])); + const __m128i T11 = _mm_load_si128((__m128i *) (transform16x16_1[1][1])); + const __m128i T12 = _mm_load_si128((__m128i *) (transform16x16_1[1][2])); + const __m128i T13 = _mm_load_si128((__m128i *) (transform16x16_1[1][3])); + const __m128i T14 = _mm_load_si128((__m128i *) (transform16x16_1[1][4])); + const __m128i T15 = _mm_load_si128((__m128i *) (transform16x16_1[1][5])); + const __m128i T16 = _mm_load_si128((__m128i *) (transform16x16_1[1][6])); + const __m128i
T17 = _mm_load_si128((__m128i *) (transform16x16_1[1][7])); + const __m128i T20 = _mm_load_si128((__m128i *) (transform16x16_1[2][0])); + const __m128i T21 = _mm_load_si128((__m128i *) (transform16x16_1[2][1])); + const __m128i T22 = _mm_load_si128((__m128i *) (transform16x16_1[2][2])); + const __m128i T23 = _mm_load_si128((__m128i *) (transform16x16_1[2][3])); + const __m128i T24 = _mm_load_si128((__m128i *) (transform16x16_1[2][4])); + const __m128i T25 = _mm_load_si128((__m128i *) (transform16x16_1[2][5])); + const __m128i T26 = _mm_load_si128((__m128i *) (transform16x16_1[2][6])); + const __m128i T27 = _mm_load_si128((__m128i *) (transform16x16_1[2][7])); + const __m128i T30 = _mm_load_si128((__m128i *) (transform16x16_1[3][0])); + const __m128i T31 = _mm_load_si128((__m128i *) (transform16x16_1[3][1])); + const __m128i T32 = _mm_load_si128((__m128i *) (transform16x16_1[3][2])); + const __m128i T33 = _mm_load_si128((__m128i *) (transform16x16_1[3][3])); + const __m128i T34 = _mm_load_si128((__m128i *) (transform16x16_1[3][4])); + const __m128i T35 = _mm_load_si128((__m128i *) (transform16x16_1[3][5])); + const __m128i T36 = _mm_load_si128((__m128i *) (transform16x16_1[3][6])); + const __m128i T37 = _mm_load_si128((__m128i *) (transform16x16_1[3][7])); + + const __m128i U00 = _mm_load_si128((__m128i *) (transform16x16_2[0][0])); + const __m128i U01 = _mm_load_si128((__m128i *) (transform16x16_2[0][1])); + const __m128i U02 = _mm_load_si128((__m128i *) (transform16x16_2[0][2])); + const __m128i U03 = _mm_load_si128((__m128i *) (transform16x16_2[0][3])); + const __m128i U10 = _mm_load_si128((__m128i *) (transform16x16_2[1][0])); + const __m128i U11 = _mm_load_si128((__m128i *) (transform16x16_2[1][1])); + const __m128i U12 = _mm_load_si128((__m128i *) (transform16x16_2[1][2])); + const __m128i U13 = _mm_load_si128((__m128i *) (transform16x16_2[1][3])); + + const __m128i V00 = _mm_load_si128((__m128i *) (transform16x16_3[0][0])); + const __m128i V01 =
_mm_load_si128((__m128i *) (transform16x16_3[0][1])); + const __m128i V10 = _mm_load_si128((__m128i *) (transform16x16_3[1][0])); + const __m128i V11 = _mm_load_si128((__m128i *) (transform16x16_3[1][1])); + + + + int j; + m128iS0 = _mm_load_si128((__m128i *) (src)); + m128iS1 = _mm_load_si128((__m128i *) (src + 16)); + m128iS2 = _mm_load_si128((__m128i *) (src + 32)); + m128iS3 = _mm_load_si128((__m128i *) (src + 48)); + m128iS4 = _mm_loadu_si128((__m128i *) (src + 64)); + m128iS5 = _mm_load_si128((__m128i *) (src + 80)); + m128iS6 = _mm_load_si128((__m128i *) (src + 96)); + m128iS7 = _mm_load_si128((__m128i *) (src + 112)); + m128iS8 = _mm_load_si128((__m128i *) (src + 128)); + m128iS9 = _mm_load_si128((__m128i *) (src + 144)); + m128iS10 = _mm_load_si128((__m128i *) (src + 160)); + m128iS11 = _mm_load_si128((__m128i *) (src + 176)); + m128iS12 = _mm_load_si128((__m128i *) (src + 192)); + m128iS13 = _mm_load_si128((__m128i *) (src + 208)); + m128iS14 = _mm_load_si128((__m128i *) (src + 224)); + m128iS15 = _mm_load_si128((__m128i *) (src + 240)); + shift = shift_1st; + m128iAdd = _mm_set1_epi32(add_1st); + + for (j = 0; j < 2; j++) { /* j = 0: first pass, j = 1: second pass */ + for (i = 0; i < 16; i += 8) { /* each pass handles the block in two groups of eight 16-bit lanes */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); + E0l = _mm_madd_epi16(m128Tmp0,T00); + m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); + E0h = _mm_madd_epi16(m128Tmp1,T00); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); + E1l = _mm_madd_epi16(m128Tmp2,T10); + m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); + E1h = _mm_madd_epi16(m128Tmp3,T10); + + m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11); + E2l = _mm_madd_epi16(m128Tmp4,T20); + m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11); + E2h = _mm_madd_epi16(m128Tmp5,T20); + + m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15); + E3l = _mm_madd_epi16(m128Tmp6,T30); + m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15); + E3h = _mm_madd_epi16(m128Tmp7,T30); + + O0l = _mm_add_epi32(E0l, E1l); + O0l = _mm_add_epi32(O0l, E2l); + O0l =
_mm_add_epi32(O0l, E3l); + + O0h = _mm_add_epi32(E0h, E1h); + O0h = _mm_add_epi32(O0h, E2h); + O0h = _mm_add_epi32(O0h, E3h); + + /* Compute O1*/ + E0l = _mm_madd_epi16(m128Tmp0,T01); + E0h = _mm_madd_epi16(m128Tmp1,T01); + E1l = _mm_madd_epi16(m128Tmp2,T11); + E1h = _mm_madd_epi16(m128Tmp3,T11); + E2l = _mm_madd_epi16(m128Tmp4,T21); + E2h = _mm_madd_epi16(m128Tmp5,T21); + E3l = _mm_madd_epi16(m128Tmp6,T31); + E3h = _mm_madd_epi16(m128Tmp7,T31); + O1l = _mm_add_epi32(E0l, E1l); + O1l = _mm_add_epi32(O1l, E2l); + O1l = _mm_add_epi32(O1l, E3l); + O1h = _mm_add_epi32(E0h, E1h); + O1h = _mm_add_epi32(O1h, E2h); + O1h = _mm_add_epi32(O1h, E3h); + + /* Compute O2*/ + E0l = _mm_madd_epi16(m128Tmp0,T02); + E0h = _mm_madd_epi16(m128Tmp1,T02); + E1l = _mm_madd_epi16(m128Tmp2,T12); + E1h = _mm_madd_epi16(m128Tmp3,T12); + E2l = _mm_madd_epi16(m128Tmp4,T22); + E2h = _mm_madd_epi16(m128Tmp5,T22); + E3l = _mm_madd_epi16(m128Tmp6,T32); + E3h = _mm_madd_epi16(m128Tmp7,T32); + O2l = _mm_add_epi32(E0l, E1l); + O2l = _mm_add_epi32(O2l, E2l); + O2l = _mm_add_epi32(O2l, E3l); + + O2h = _mm_add_epi32(E0h, E1h); + O2h = _mm_add_epi32(O2h, E2h); + O2h = _mm_add_epi32(O2h, E3h); + + /* Compute O3*/ + E0l = _mm_madd_epi16(m128Tmp0,T03); + E0h = _mm_madd_epi16(m128Tmp1,T03); + E1l = _mm_madd_epi16(m128Tmp2,T13); + E1h = _mm_madd_epi16(m128Tmp3,T13); + E2l = _mm_madd_epi16(m128Tmp4,T23); + E2h = _mm_madd_epi16(m128Tmp5,T23); + E3l = _mm_madd_epi16(m128Tmp6,T33); + E3h = _mm_madd_epi16(m128Tmp7,T33); + + O3l = _mm_add_epi32(E0l, E1l); + O3l = _mm_add_epi32(O3l, E2l); + O3l = _mm_add_epi32(O3l, E3l); + + O3h = _mm_add_epi32(E0h, E1h); + O3h = _mm_add_epi32(O3h, E2h); + O3h = _mm_add_epi32(O3h, E3h); + + /* Compute O4*/ + + E0l = _mm_madd_epi16(m128Tmp0,T04); + E0h = _mm_madd_epi16(m128Tmp1,T04); + E1l = _mm_madd_epi16(m128Tmp2,T14); + E1h = _mm_madd_epi16(m128Tmp3,T14); + E2l = _mm_madd_epi16(m128Tmp4,T24); + E2h = _mm_madd_epi16(m128Tmp5,T24); + E3l = _mm_madd_epi16(m128Tmp6,T34); + E3h =
_mm_madd_epi16(m128Tmp7,T34); + + O4l = _mm_add_epi32(E0l, E1l); + O4l = _mm_add_epi32(O4l, E2l); + O4l = _mm_add_epi32(O4l, E3l); + + O4h = _mm_add_epi32(E0h, E1h); + O4h = _mm_add_epi32(O4h, E2h); + O4h = _mm_add_epi32(O4h, E3h); + + /* Compute O5*/ + E0l = _mm_madd_epi16(m128Tmp0,T05); + E0h = _mm_madd_epi16(m128Tmp1,T05); + E1l = _mm_madd_epi16(m128Tmp2,T15); + E1h = _mm_madd_epi16(m128Tmp3,T15); + E2l = _mm_madd_epi16(m128Tmp4,T25); + E2h = _mm_madd_epi16(m128Tmp5,T25); + E3l = _mm_madd_epi16(m128Tmp6,T35); + E3h = _mm_madd_epi16(m128Tmp7,T35); + + O5l = _mm_add_epi32(E0l, E1l); + O5l = _mm_add_epi32(O5l, E2l); + O5l = _mm_add_epi32(O5l, E3l); + + O5h = _mm_add_epi32(E0h, E1h); + O5h = _mm_add_epi32(O5h, E2h); + O5h = _mm_add_epi32(O5h, E3h); + + /* Compute O6*/ + + E0l = _mm_madd_epi16(m128Tmp0,T06); + E0h = _mm_madd_epi16(m128Tmp1,T06); + E1l = _mm_madd_epi16(m128Tmp2,T16); + E1h = _mm_madd_epi16(m128Tmp3,T16); + E2l = _mm_madd_epi16(m128Tmp4,T26); + E2h = _mm_madd_epi16(m128Tmp5,T26); + E3l = _mm_madd_epi16(m128Tmp6,T36); + E3h = _mm_madd_epi16(m128Tmp7,T36); + + O6l = _mm_add_epi32(E0l, E1l); + O6l = _mm_add_epi32(O6l, E2l); + O6l = _mm_add_epi32(O6l, E3l); + + O6h = _mm_add_epi32(E0h, E1h); + O6h = _mm_add_epi32(O6h, E2h); + O6h = _mm_add_epi32(O6h, E3h); + + /* Compute O7*/ + + E0l = _mm_madd_epi16(m128Tmp0,T07); + E0h = _mm_madd_epi16(m128Tmp1,T07); + E1l = _mm_madd_epi16(m128Tmp2,T17); + E1h = _mm_madd_epi16(m128Tmp3,T17); + E2l = _mm_madd_epi16(m128Tmp4,T27); + E2h = _mm_madd_epi16(m128Tmp5,T27); + E3l = _mm_madd_epi16(m128Tmp6,T37); + E3h = _mm_madd_epi16(m128Tmp7,T37); + + O7l = _mm_add_epi32(E0l, E1l); + O7l = _mm_add_epi32(O7l, E2l); + O7l = _mm_add_epi32(O7l, E3l); + + O7h = _mm_add_epi32(E0h, E1h); + O7h = _mm_add_epi32(O7h, E2h); + O7h = _mm_add_epi32(O7h, E3h); + + /* Compute E0 */ + + + + m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); + E0l = _mm_madd_epi16(m128Tmp0,U00); + m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); + E0h =
_mm_madd_epi16(m128Tmp1,U00); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14); + E0l = _mm_add_epi32(E0l, + _mm_madd_epi16(m128Tmp2,U10)); + m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14); + E0h = _mm_add_epi32(E0h, + _mm_madd_epi16(m128Tmp3,U10)); + + /* Compute E1 */ + E1l = _mm_madd_epi16(m128Tmp0,U01); + E1h = _mm_madd_epi16(m128Tmp1,U01); + E1l = _mm_add_epi32(E1l, + _mm_madd_epi16(m128Tmp2,U11)); + E1h = _mm_add_epi32(E1h, + _mm_madd_epi16(m128Tmp3,U11)); + + /* Compute E2 */ + E2l = _mm_madd_epi16(m128Tmp0,U02); + E2h = _mm_madd_epi16(m128Tmp1,U02); + E2l = _mm_add_epi32(E2l, + _mm_madd_epi16(m128Tmp2,U12)); + E2h = _mm_add_epi32(E2h, + _mm_madd_epi16(m128Tmp3,U12)); + /* Compute E3 */ + E3l = _mm_madd_epi16(m128Tmp0,U03); + E3h = _mm_madd_epi16(m128Tmp1,U03); + E3l = _mm_add_epi32(E3l, + _mm_madd_epi16(m128Tmp2,U13)); + E3h = _mm_add_epi32(E3h, + _mm_madd_epi16(m128Tmp3,U13)); + + /* Compute EE0 and EEE */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12); + E00l = _mm_madd_epi16(m128Tmp0,V00); + m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12); + E00h = _mm_madd_epi16(m128Tmp1,V00); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8); + EE0l = _mm_madd_epi16(m128Tmp2,V10); + m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8); + EE0h = _mm_madd_epi16(m128Tmp3,V10); + + E01l = _mm_madd_epi16(m128Tmp0,V01); + E01h = _mm_madd_epi16(m128Tmp1,V01); + + EE1l = _mm_madd_epi16(m128Tmp2,V11); + EE1h = _mm_madd_epi16(m128Tmp3,V11); + + /* Compute EE */ + EE2l = _mm_sub_epi32(EE1l, E01l); + EE3l = _mm_sub_epi32(EE0l, E00l); + EE2h = _mm_sub_epi32(EE1h, E01h); + EE3h = _mm_sub_epi32(EE0h, E00h); + + EE0l = _mm_add_epi32(EE0l, E00l); + EE1l = _mm_add_epi32(EE1l, E01l); + EE0h = _mm_add_epi32(EE0h, E00h); + EE1h = _mm_add_epi32(EE1h, E01h); + + /* Compute E */ + + E4l = _mm_sub_epi32(EE3l, E3l); + E4l = _mm_add_epi32(E4l, m128iAdd); + + E5l = _mm_sub_epi32(EE2l, E2l); + E5l = _mm_add_epi32(E5l, m128iAdd); + + E6l = _mm_sub_epi32(EE1l, E1l); + E6l =
_mm_add_epi32(E6l, m128iAdd); + + E7l = _mm_sub_epi32(EE0l, E0l); + E7l = _mm_add_epi32(E7l, m128iAdd); + + E4h = _mm_sub_epi32(EE3h, E3h); + E4h = _mm_add_epi32(E4h, m128iAdd); + + E5h = _mm_sub_epi32(EE2h, E2h); + E5h = _mm_add_epi32(E5h, m128iAdd); + + E6h = _mm_sub_epi32(EE1h, E1h); + E6h = _mm_add_epi32(E6h, m128iAdd); + + E7h = _mm_sub_epi32(EE0h, E0h); + E7h = _mm_add_epi32(E7h, m128iAdd); + + E0l = _mm_add_epi32(EE0l, E0l); + E0l = _mm_add_epi32(E0l, m128iAdd); + + E1l = _mm_add_epi32(EE1l, E1l); + E1l = _mm_add_epi32(E1l, m128iAdd); + + E2l = _mm_add_epi32(EE2l, E2l); + E2l = _mm_add_epi32(E2l, m128iAdd); + + E3l = _mm_add_epi32(EE3l, E3l); + E3l = _mm_add_epi32(E3l, m128iAdd); + + E0h = _mm_add_epi32(EE0h, E0h); + E0h = _mm_add_epi32(E0h, m128iAdd); + + E1h = _mm_add_epi32(EE1h, E1h); + E1h = _mm_add_epi32(E1h, m128iAdd); + + E2h = _mm_add_epi32(EE2h, E2h); + E2h = _mm_add_epi32(E2h, m128iAdd); + + E3h = _mm_add_epi32(EE3h, E3h); + E3h = _mm_add_epi32(E3h, m128iAdd); + + m128iS0 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift), + _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift)); + m128iS1 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift), + _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift)); + m128iS2 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift), + _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift)); + m128iS3 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift), + _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift)); + + m128iS4 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift), + _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift)); + m128iS5 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift), + _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift)); + m128iS6 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift), + _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift)); + m128iS7 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift), +
_mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift)); + + m128iS15 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift), + _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift)); + m128iS14 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift), + _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift)); + m128iS13 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift), + _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift)); + m128iS12 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift), + _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift)); + + m128iS11 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift), + _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift)); + m128iS10 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift), + _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift)); + m128iS9 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift), + _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift)); + m128iS8 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift), + _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift)); + + + + if (!j) { //first pass + + /* Transpose the matrix */ + E0l = _mm_unpacklo_epi16(m128iS0, m128iS8); + E1l = _mm_unpacklo_epi16(m128iS1, m128iS9); + E2l = _mm_unpacklo_epi16(m128iS2, m128iS10); + E3l = _mm_unpacklo_epi16(m128iS3, m128iS11); + E4l = _mm_unpacklo_epi16(m128iS4, m128iS12); + E5l = _mm_unpacklo_epi16(m128iS5, m128iS13); + E6l = _mm_unpacklo_epi16(m128iS6, m128iS14); + E7l = _mm_unpacklo_epi16(m128iS7, m128iS15); + + E0h = _mm_unpackhi_epi16(m128iS0, m128iS8); + E1h = _mm_unpackhi_epi16(m128iS1, m128iS9); + E2h = _mm_unpackhi_epi16(m128iS2, m128iS10); + E3h = _mm_unpackhi_epi16(m128iS3, m128iS11); + E4h = _mm_unpackhi_epi16(m128iS4, m128iS12); + E5h = _mm_unpackhi_epi16(m128iS5, m128iS13); + E6h = _mm_unpackhi_epi16(m128iS6, m128iS14); + E7h = _mm_unpackhi_epi16(m128iS7, m128iS15); + + m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l); + m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l); +
m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l); + m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l); + m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l); + m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l); + m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); + m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); + m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); + m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); + m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); + m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); + m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS12
= _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + if (!i) { /* first group done: park it in r0..r15 and load elements 8..15 of every coefficient row */ + + r0= m128iS0; //0 + r1= m128iS1; //16 + r2= m128iS2; //32 + r3= m128iS3; //48 + r4= m128iS4; //64 + r5= m128iS5; //80 + r6= m128iS6; //96 + r7= m128iS7; //112 + r8= m128iS8; //128 + r9= m128iS9; //144 + r10= m128iS10; //160 + r11= m128iS11; //176 + r12= m128iS12; //192 + r13= m128iS13; //208 + r14= m128iS14; //224 + r15= m128iS15; //240 + + + + m128iS0 = _mm_load_si128((__m128i *) (src + 8)); + m128iS1 = _mm_load_si128((__m128i *) (src + 24)); + m128iS2 = _mm_load_si128((__m128i *) (src + 40)); + m128iS3 = _mm_load_si128((__m128i *) (src + 56)); + m128iS4 = _mm_loadu_si128((__m128i *) (src + 72)); + m128iS5 = _mm_load_si128((__m128i *) (src + 88)); + m128iS6 = _mm_load_si128((__m128i *) (src + 104)); + m128iS7 = _mm_load_si128((__m128i *) (src + 120)); + m128iS8 = _mm_load_si128((__m128i *) (src + 136)); + m128iS9 = _mm_load_si128((__m128i *) (src + 152)); + m128iS10 = _mm_load_si128((__m128i *) (src + 168)); + m128iS11 = _mm_load_si128((__m128i *) (src + 184)); + m128iS12 = _mm_load_si128((__m128i *) (src + 200)); + m128iS13 = _mm_load_si128((__m128i *) (src + 216)); + m128iS14 = _mm_load_si128((__m128i *) (src + 232)); + m128iS15 = _mm_load_si128((__m128i *) (src + 248)); + } else { /* second group done: keep it in r16..r31, feed the even-numbered r registers to pass 2 and switch to the second-pass shift/rounding */ + + r16= m128iS0; //8 + r17= m128iS1; //24 + r18= m128iS2; //40 + r19= m128iS3; //56 + r20= m128iS4; //72 + r21= m128iS5; //88 + r22= m128iS6; //104 + r23= m128iS7; //120 + r24= m128iS8; //136 + r25= m128iS9; //152 + r26= m128iS10; //168 + r27= m128iS11; //184 + r28= m128iS12; //200 + r29= m128iS13; //216 + r30= m128iS14; //232 + r31= m128iS15; //248 + + //prepare next iteration : + + m128iS0= r0; + m128iS1= r2; + m128iS2= r4; + m128iS3= r6; + m128iS4= r8; +
m128iS5= r10; + m128iS6= r12; + m128iS7= r14; + m128iS8= r16; + m128iS9= r18; + m128iS10=r20; + m128iS11=r22; + m128iS12=r24; + m128iS13=r26; + m128iS14=r28; + m128iS15=r30; + + shift = shift_2nd; + m128iAdd = _mm_set1_epi32(add_2nd); + } + + } else { + + //transpose half matrix : + //instead of having 1 register = 1 half-column, + //1 register = 1 half-row. + E0l = _mm_unpacklo_epi16(m128iS0, m128iS1); + E1l = _mm_unpacklo_epi16(m128iS2, m128iS3); + E2l = _mm_unpacklo_epi16(m128iS4, m128iS5); + E3l = _mm_unpacklo_epi16(m128iS6, m128iS7); + E4l = _mm_unpacklo_epi16(m128iS8, m128iS9); + E5l = _mm_unpacklo_epi16(m128iS10, m128iS11); + E6l = _mm_unpacklo_epi16(m128iS12, m128iS13); + E7l = _mm_unpacklo_epi16(m128iS14, m128iS15); + + O0l = _mm_unpackhi_epi16(m128iS0, m128iS1); + O1l = _mm_unpackhi_epi16(m128iS2, m128iS3); + O2l = _mm_unpackhi_epi16(m128iS4, m128iS5); + O3l = _mm_unpackhi_epi16(m128iS6, m128iS7); + O4l = _mm_unpackhi_epi16(m128iS8, m128iS9); + O5l = _mm_unpackhi_epi16(m128iS10, m128iS11); + O6l = _mm_unpackhi_epi16(m128iS12, m128iS13); + O7l = _mm_unpackhi_epi16(m128iS14, m128iS15); + + + m128Tmp0 = _mm_unpacklo_epi32(E0l, E1l); + m128Tmp1 = _mm_unpacklo_epi32(E2l, E3l); + + m128Tmp2 = _mm_unpacklo_epi32(E4l, E5l); + m128Tmp3 = _mm_unpacklo_epi32(E6l, E7l); + + r0 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); //1st half 1st row + r2 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); //2nd half 1st row + + + r4 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); //1st half 2nd row + r6 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); //2nd half 2nd row + + m128Tmp0 = _mm_unpackhi_epi32(E0l, E1l); + m128Tmp1 = _mm_unpackhi_epi32(E2l, E3l); + m128Tmp2 = _mm_unpackhi_epi32(E4l, E5l); + m128Tmp3 = _mm_unpackhi_epi32(E6l, E7l); + + + r8 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); + r10 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); + + r12 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); + r14 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); + + m128Tmp0 = _mm_unpacklo_epi32(O0l, O1l); + m128Tmp1 =
_mm_unpacklo_epi32(O2l, O3l); + m128Tmp2 = _mm_unpacklo_epi32(O4l, O5l); + m128Tmp3 = _mm_unpacklo_epi32(O6l, O7l); + + r16 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); + r18 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); + + + r20 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); + r22 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); + + m128Tmp0 = _mm_unpackhi_epi32(O0l, O1l); + m128Tmp1 = _mm_unpackhi_epi32(O2l, O3l); + m128Tmp2 = _mm_unpackhi_epi32(O4l, O5l); + m128Tmp3 = _mm_unpackhi_epi32(O6l, O7l); + + r24 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); + r26 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); + + + r28 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); + r30 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); + + dst = (uint8_t*) (_dst + (i*stride)); /* destination rows i..i+7; the aligned loads/stores below need each row 16-byte aligned */ + m128Tmp0= _mm_setzero_si128(); + m128Tmp1= _mm_load_si128((__m128i*)dst); + m128Tmp2= _mm_load_si128((__m128i*)(dst+stride)); + m128Tmp3= _mm_load_si128((__m128i*)(dst+2*stride)); + m128Tmp4= _mm_load_si128((__m128i*)(dst+3*stride)); + m128Tmp5= _mm_load_si128((__m128i*)(dst+4*stride)); + m128Tmp6= _mm_load_si128((__m128i*)(dst+5*stride)); + m128Tmp7= _mm_load_si128((__m128i*)(dst+6*stride)); + E0l= _mm_load_si128((__m128i*)(dst+7*stride)); + + + r0= _mm_adds_epi16(r0,_mm_unpacklo_epi8(m128Tmp1,m128Tmp0)); + r2= _mm_adds_epi16(r2,_mm_unpackhi_epi8(m128Tmp1,m128Tmp0)); + r0= _mm_packus_epi16(r0,r2); + + + + + r4= _mm_adds_epi16(r4,_mm_unpacklo_epi8(m128Tmp2,m128Tmp0)); + r6= _mm_adds_epi16(r6,_mm_unpackhi_epi8(m128Tmp2,m128Tmp0)); + r4= _mm_packus_epi16(r4,r6); + + + r8= _mm_adds_epi16(r8,_mm_unpacklo_epi8(m128Tmp3,m128Tmp0)); + r10= _mm_adds_epi16(r10,_mm_unpackhi_epi8(m128Tmp3,m128Tmp0)); + r8= _mm_packus_epi16(r8,r10); + + + r12= _mm_adds_epi16(r12,_mm_unpacklo_epi8(m128Tmp4,m128Tmp0)); + r14= _mm_adds_epi16(r14,_mm_unpackhi_epi8(m128Tmp4,m128Tmp0)); + r12= _mm_packus_epi16(r12,r14); + + + r16= _mm_adds_epi16(r16,_mm_unpacklo_epi8(m128Tmp5,m128Tmp0)); + r18= _mm_adds_epi16(r18,_mm_unpackhi_epi8(m128Tmp5,m128Tmp0)); + r16=
_mm_packus_epi16(r16,r18); + + + r20= _mm_adds_epi16(r20,_mm_unpacklo_epi8(m128Tmp6,m128Tmp0)); + r22= _mm_adds_epi16(r22,_mm_unpackhi_epi8(m128Tmp6,m128Tmp0)); + r20= _mm_packus_epi16(r20,r22); + + + r24= _mm_adds_epi16(r24,_mm_unpacklo_epi8(m128Tmp7,m128Tmp0)); + r26= _mm_adds_epi16(r26,_mm_unpackhi_epi8(m128Tmp7,m128Tmp0)); + r24= _mm_packus_epi16(r24,r26); + + + + r28= _mm_adds_epi16(r28,_mm_unpacklo_epi8(E0l,m128Tmp0)); + r30= _mm_adds_epi16(r30,_mm_unpackhi_epi8(E0l,m128Tmp0)); + r28= _mm_packus_epi16(r28,r30); + + _mm_store_si128((__m128i*)dst,r0); + _mm_store_si128((__m128i*)(dst+stride),r4); + _mm_store_si128((__m128i*)(dst+2*stride),r8); + _mm_store_si128((__m128i*)(dst+3*stride),r12); + _mm_store_si128((__m128i*)(dst+4*stride),r16); + _mm_store_si128((__m128i*)(dst+5*stride),r20); + _mm_store_si128((__m128i*)(dst+6*stride),r24); + _mm_store_si128((__m128i*)(dst+7*stride),r28); + + + + if (!i) { + //first half done, can store ! + + + m128iS0= r1; + m128iS1= r3; + m128iS2= r5; + m128iS3= r7; + m128iS4= r9; + m128iS5= r11; + m128iS6= r13; + m128iS7= r15; + m128iS8= r17; + m128iS9= r19; + m128iS10=r21; + m128iS11=r23; + m128iS12=r25; + m128iS13=r27; + m128iS14=r29; + m128iS15=r31; + } + } + } + } +} +#endif + + +#if 0 +void ff_hevc_transform_16x16_add_10_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + int i; + uint16_t *dst = (uint16_t*) _dst; + ptrdiff_t stride = _stride / 2; + int16_t *src = coeffs; + int32_t shift; + uint8_t shift_2nd = 10; //20 - bit depth + uint16_t add_2nd = 1 << 9; //shift - 1; + __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, + m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13, + m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, + m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h, + E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h, + O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l, + E00l, E01l, EE0h, EE1h, EE2h, EE3h,
E00h, E01h; + __m128i E4l, E5l, E6l, E7l; + __m128i E4h, E5h, E6h, E7h; + int j; + m128iS0 = _mm_load_si128((__m128i *) (src)); + m128iS1 = _mm_load_si128((__m128i *) (src + 16)); + m128iS2 = _mm_load_si128((__m128i *) (src + 32)); + m128iS3 = _mm_load_si128((__m128i *) (src + 48)); + m128iS4 = _mm_loadu_si128((__m128i *) (src + 64)); + m128iS5 = _mm_load_si128((__m128i *) (src + 80)); + m128iS6 = _mm_load_si128((__m128i *) (src + 96)); + m128iS7 = _mm_load_si128((__m128i *) (src + 112)); + m128iS8 = _mm_load_si128((__m128i *) (src + 128)); + m128iS9 = _mm_load_si128((__m128i *) (src + 144)); + m128iS10 = _mm_load_si128((__m128i *) (src + 160)); + m128iS11 = _mm_load_si128((__m128i *) (src + 176)); + m128iS12 = _mm_loadu_si128((__m128i *) (src + 192)); + m128iS13 = _mm_load_si128((__m128i *) (src + 208)); + m128iS14 = _mm_load_si128((__m128i *) (src + 224)); + m128iS15 = _mm_load_si128((__m128i *) (src + 240)); + shift = shift_1st; + m128iAdd = _mm_set1_epi32(add_1st); + + for (j = 0; j < 2; j++) { + for (i = 0; i < 16; i += 8) { + + m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_1[1][0]))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_1[1][0]))); + + m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform16x16_1[2][0]))); + m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform16x16_1[2][0]))); + + m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15); + E3l = _mm_madd_epi16(m128Tmp6, + 
_mm_load_si128((__m128i *) (transform16x16_1[3][0]))); + m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform16x16_1[3][0]))); + + O0l = _mm_add_epi32(E0l, E1l); + O0l = _mm_add_epi32(O0l, E2l); + O0l = _mm_add_epi32(O0l, E3l); + + O0h = _mm_add_epi32(E0h, E1h); + O0h = _mm_add_epi32(O0h, E2h); + O0h = _mm_add_epi32(O0h, E3h); + + /* Compute O1*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_1[1][1]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_1[1][1]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform16x16_1[2][1]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform16x16_1[2][1]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform16x16_1[3][1]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform16x16_1[3][1]))); + O1l = _mm_add_epi32(E0l, E1l); + O1l = _mm_add_epi32(O1l, E2l); + O1l = _mm_add_epi32(O1l, E3l); + O1h = _mm_add_epi32(E0h, E1h); + O1h = _mm_add_epi32(O1h, E2h); + O1h = _mm_add_epi32(O1h, E3h); + + /* Compute O2*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_1[1][2]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_1[1][2]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform16x16_1[2][2]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform16x16_1[2][2]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform16x16_1[3][2]))); + E3h = 
_mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform16x16_1[3][2]))); + O2l = _mm_add_epi32(E0l, E1l); + O2l = _mm_add_epi32(O2l, E2l); + O2l = _mm_add_epi32(O2l, E3l); + + O2h = _mm_add_epi32(E0h, E1h); + O2h = _mm_add_epi32(O2h, E2h); + O2h = _mm_add_epi32(O2h, E3h); + + /* Compute O3*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_1[1][3]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_1[1][3]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform16x16_1[2][3]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform16x16_1[2][3]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform16x16_1[3][3]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform16x16_1[3][3]))); + + O3l = _mm_add_epi32(E0l, E1l); + O3l = _mm_add_epi32(O3l, E2l); + O3l = _mm_add_epi32(O3l, E3l); + + O3h = _mm_add_epi32(E0h, E1h); + O3h = _mm_add_epi32(O3h, E2h); + O3h = _mm_add_epi32(O3h, E3h); + + /* Compute O4*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_1[1][4]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_1[1][4]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform16x16_1[2][4]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform16x16_1[2][4]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform16x16_1[3][4]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform16x16_1[3][4]))); + + O4l = _mm_add_epi32(E0l, 
E1l); + O4l = _mm_add_epi32(O4l, E2l); + O4l = _mm_add_epi32(O4l, E3l); + + O4h = _mm_add_epi32(E0h, E1h); + O4h = _mm_add_epi32(O4h, E2h); + O4h = _mm_add_epi32(O4h, E3h); + + /* Compute O5*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_1[1][5]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_1[1][5]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform16x16_1[2][5]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform16x16_1[2][5]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform16x16_1[3][5]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform16x16_1[3][5]))); + + O5l = _mm_add_epi32(E0l, E1l); + O5l = _mm_add_epi32(O5l, E2l); + O5l = _mm_add_epi32(O5l, E3l); + + O5h = _mm_add_epi32(E0h, E1h); + O5h = _mm_add_epi32(O5h, E2h); + O5h = _mm_add_epi32(O5h, E3h); + + /* Compute O6*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_1[1][6]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_1[1][6]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform16x16_1[2][6]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform16x16_1[2][6]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform16x16_1[3][6]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform16x16_1[3][6]))); + + O6l = _mm_add_epi32(E0l, E1l); + O6l = _mm_add_epi32(O6l, E2l); + O6l = _mm_add_epi32(O6l, E3l); + + O6h = _mm_add_epi32(E0h, E1h); + O6h 
= _mm_add_epi32(O6h, E2h); + O6h = _mm_add_epi32(O6h, E3h); + + /* Compute O7*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_1[1][7]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_1[1][7]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform16x16_1[2][7]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform16x16_1[2][7]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform16x16_1[3][7]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform16x16_1[3][7]))); + + O7l = _mm_add_epi32(E0l, E1l); + O7l = _mm_add_epi32(O7l, E2l); + O7l = _mm_add_epi32(O7l, E3l); + + O7h = _mm_add_epi32(E0h, E1h); + O7h = _mm_add_epi32(O7h, E2h); + O7h = _mm_add_epi32(O7h, E3h); + + /* Compute E0 */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14); + E0l = _mm_add_epi32(E0l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][0])))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14); + E0h = _mm_add_epi32(E0h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][0])))); + + /* Compute E1 */ + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); + E1l = _mm_add_epi32(E1l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][1])))); + E1h = _mm_add_epi32(E1h, + 
_mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][1])))); + + /* Compute E2 */ + E2l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); + E2h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); + E2l = _mm_add_epi32(E2l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][2])))); + E2h = _mm_add_epi32(E2h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][2])))); + /* Compute E3 */ + E3l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); + E3h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); + E3l = _mm_add_epi32(E3l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][3])))); + E3h = _mm_add_epi32(E3h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][3])))); + + /* Compute EE0 and EEE */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12); + E00l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12); + E00h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8); + EE0l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8); + EE0h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); + + E01l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); + E01h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); + + EE1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); + EE1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); + + /* Compute EE */ + EE2l = _mm_sub_epi32(EE1l, E01l); + EE3l = 
_mm_sub_epi32(EE0l, E00l); + EE2h = _mm_sub_epi32(EE1h, E01h); + EE3h = _mm_sub_epi32(EE0h, E00h); + + EE0l = _mm_add_epi32(EE0l, E00l); + EE1l = _mm_add_epi32(EE1l, E01l); + EE0h = _mm_add_epi32(EE0h, E00h); + EE1h = _mm_add_epi32(EE1h, E01h); + + /* Compute E */ + + E4l = _mm_sub_epi32(EE3l, E3l); + E4l = _mm_add_epi32(E4l, m128iAdd); + + E5l = _mm_sub_epi32(EE2l, E2l); + E5l = _mm_add_epi32(E5l, m128iAdd); + + E6l = _mm_sub_epi32(EE1l, E1l); + E6l = _mm_add_epi32(E6l, m128iAdd); + + E7l = _mm_sub_epi32(EE0l, E0l); + E7l = _mm_add_epi32(E7l, m128iAdd); + + E4h = _mm_sub_epi32(EE3h, E3h); + E4h = _mm_add_epi32(E4h, m128iAdd); + + E5h = _mm_sub_epi32(EE2h, E2h); + E5h = _mm_add_epi32(E5h, m128iAdd); + + E6h = _mm_sub_epi32(EE1h, E1h); + E6h = _mm_add_epi32(E6h, m128iAdd); + + E7h = _mm_sub_epi32(EE0h, E0h); + E7h = _mm_add_epi32(E7h, m128iAdd); + + E0l = _mm_add_epi32(EE0l, E0l); + E0l = _mm_add_epi32(E0l, m128iAdd); + + E1l = _mm_add_epi32(EE1l, E1l); + E1l = _mm_add_epi32(E1l, m128iAdd); + + E2l = _mm_add_epi32(EE2l, E2l); + E2l = _mm_add_epi32(E2l, m128iAdd); + + E3l = _mm_add_epi32(EE3l, E3l); + E3l = _mm_add_epi32(E3l, m128iAdd); + + E0h = _mm_add_epi32(EE0h, E0h); + E0h = _mm_add_epi32(E0h, m128iAdd); + + E1h = _mm_add_epi32(EE1h, E1h); + E1h = _mm_add_epi32(E1h, m128iAdd); + + E2h = _mm_add_epi32(EE2h, E2h); + E2h = _mm_add_epi32(E2h, m128iAdd); + + E3h = _mm_add_epi32(EE3h, E3h); + E3h = _mm_add_epi32(E3h, m128iAdd); + + m128iS0 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift), + _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift)); + m128iS1 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift), + _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift)); + m128iS2 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift), + _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift)); + m128iS3 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift), + _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift)); + + m128iS4 = 
_mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift), + _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift)); + m128iS5 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift), + _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift)); + m128iS6 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift), + _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift)); + m128iS7 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift), + _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift)); + + m128iS15 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift), + _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift)); + m128iS14 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift), + _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift)); + m128iS13 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift), + _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift)); + m128iS12 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift), + _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift)); + + m128iS11 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift), + _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift)); + m128iS10 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift), + _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift)); + m128iS9 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift), + _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift)); + m128iS8 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift), + _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift)); + + if (!j) { + /* Inverse the matrix */ + E0l = _mm_unpacklo_epi16(m128iS0, m128iS8); + E1l = _mm_unpacklo_epi16(m128iS1, m128iS9); + E2l = _mm_unpacklo_epi16(m128iS2, m128iS10); + E3l = _mm_unpacklo_epi16(m128iS3, m128iS11); + E4l = _mm_unpacklo_epi16(m128iS4, m128iS12); + E5l = _mm_unpacklo_epi16(m128iS5, m128iS13); + E6l = _mm_unpacklo_epi16(m128iS6, m128iS14); + E7l = _mm_unpacklo_epi16(m128iS7, m128iS15); + + O0l = 
_mm_unpackhi_epi16(m128iS0, m128iS8); + O1l = _mm_unpackhi_epi16(m128iS1, m128iS9); + O2l = _mm_unpackhi_epi16(m128iS2, m128iS10); + O3l = _mm_unpackhi_epi16(m128iS3, m128iS11); + O4l = _mm_unpackhi_epi16(m128iS4, m128iS12); + O5l = _mm_unpackhi_epi16(m128iS5, m128iS13); + O6l = _mm_unpackhi_epi16(m128iS6, m128iS14); + O7l = _mm_unpackhi_epi16(m128iS7, m128iS15); + + m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l); + m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l); + m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l); + m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l); + m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l); + m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l); + m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpacklo_epi16(O0l, O4l); + m128Tmp1 = _mm_unpacklo_epi16(O1l, O5l); + m128Tmp2 = _mm_unpacklo_epi16(O2l, O6l); + m128Tmp3 = _mm_unpacklo_epi16(O3l, O7l); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, 
m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(O0l, O4l); + m128Tmp1 = _mm_unpackhi_epi16(O1l, O5l); + m128Tmp2 = _mm_unpackhi_epi16(O2l, O6l); + m128Tmp3 = _mm_unpackhi_epi16(O3l, O7l); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + /* */ + _mm_store_si128((__m128i *) (src + i), m128iS0); + _mm_store_si128((__m128i *) (src + 16 + i), m128iS1); + _mm_store_si128((__m128i *) (src + 32 + i), m128iS2); + _mm_store_si128((__m128i *) (src + 48 + i), m128iS3); + _mm_store_si128((__m128i *) (src + 64 + i), m128iS4); + _mm_store_si128((__m128i *) (src + 80 + i), m128iS5); + _mm_store_si128((__m128i *) (src + 96 + i), m128iS6); + _mm_store_si128((__m128i *) (src + 112 + i), m128iS7); + _mm_store_si128((__m128i *) (src + 128 + i), m128iS8); + _mm_store_si128((__m128i *) (src + 144 + i), m128iS9); + _mm_store_si128((__m128i *) (src + 160 + i), m128iS10); + _mm_store_si128((__m128i *) (src + 176 + i), m128iS11); + _mm_store_si128((__m128i *) (src + 192 + i), m128iS12); + _mm_store_si128((__m128i *) (src + 208 + i), m128iS13); + _mm_store_si128((__m128i *) (src + 224 + i), m128iS14); + _mm_store_si128((__m128i *) (src + 240 + i), m128iS15); + + if (!i) { + m128iS0 = _mm_load_si128((__m128i *) (src + 8)); + m128iS1 = _mm_load_si128((__m128i *) (src + 24)); + m128iS2 = _mm_load_si128((__m128i *) (src + 40)); + m128iS3 = _mm_load_si128((__m128i *) (src + 56)); + m128iS4 = _mm_loadu_si128((__m128i *) (src + 72)); + m128iS5 = _mm_load_si128((__m128i 
*) (src + 88)); + m128iS6 = _mm_load_si128((__m128i *) (src + 104)); + m128iS7 = _mm_load_si128((__m128i *) (src + 120)); + m128iS8 = _mm_load_si128((__m128i *) (src + 136)); + m128iS9 = _mm_load_si128((__m128i *) (src + 152)); + m128iS10 = _mm_load_si128((__m128i *) (src + 168)); + m128iS11 = _mm_load_si128((__m128i *) (src + 184)); + m128iS12 = _mm_loadu_si128((__m128i *) (src + 200)); + m128iS13 = _mm_load_si128((__m128i *) (src + 216)); + m128iS14 = _mm_load_si128((__m128i *) (src + 232)); + m128iS15 = _mm_load_si128((__m128i *) (src + 248)); + } else { + m128iS0 = _mm_load_si128((__m128i *) (src)); + m128iS1 = _mm_load_si128((__m128i *) (src + 32)); + m128iS2 = _mm_load_si128((__m128i *) (src + 64)); + m128iS3 = _mm_load_si128((__m128i *) (src + 96)); + m128iS4 = _mm_loadu_si128((__m128i *) (src + 128)); + m128iS5 = _mm_load_si128((__m128i *) (src + 160)); + m128iS6 = _mm_load_si128((__m128i *) (src + 192)); + m128iS7 = _mm_load_si128((__m128i *) (src + 224)); + m128iS8 = _mm_load_si128((__m128i *) (src + 8)); + m128iS9 = _mm_load_si128((__m128i *) (src + 32 + 8)); + m128iS10 = _mm_load_si128((__m128i *) (src + 64 + 8)); + m128iS11 = _mm_load_si128((__m128i *) (src + 96 + 8)); + m128iS12 = _mm_loadu_si128((__m128i *) (src + 128 + 8)); + m128iS13 = _mm_load_si128((__m128i *) (src + 160 + 8)); + m128iS14 = _mm_load_si128((__m128i *) (src + 192 + 8)); + m128iS15 = _mm_load_si128((__m128i *) (src + 224 + 8)); + shift = shift_2nd; + m128iAdd = _mm_set1_epi32(add_2nd); + } + + } else { + int k, m = 0; + _mm_storeu_si128((__m128i *) (src), m128iS0); + _mm_storeu_si128((__m128i *) (src + 8), m128iS1); + _mm_storeu_si128((__m128i *) (src + 32), m128iS2); + _mm_storeu_si128((__m128i *) (src + 40), m128iS3); + _mm_storeu_si128((__m128i *) (src + 64), m128iS4); + _mm_storeu_si128((__m128i *) (src + 72), m128iS5); + _mm_storeu_si128((__m128i *) (src + 96), m128iS6); + _mm_storeu_si128((__m128i *) (src + 104), m128iS7); + _mm_storeu_si128((__m128i *) (src + 128), m128iS8); 
+ _mm_storeu_si128((__m128i *) (src + 136), m128iS9); + _mm_storeu_si128((__m128i *) (src + 160), m128iS10); + _mm_storeu_si128((__m128i *) (src + 168), m128iS11); + _mm_storeu_si128((__m128i *) (src + 192), m128iS12); + _mm_storeu_si128((__m128i *) (src + 200), m128iS13); + _mm_storeu_si128((__m128i *) (src + 224), m128iS14); + _mm_storeu_si128((__m128i *) (src + 232), m128iS15); + dst = (uint16_t*) _dst + (i * stride); + + for (k = 0; k < 8; k++) { + dst[0] = av_clip_uintp2(dst[0] + src[m],10); + dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10); + dst[2] = av_clip_uintp2(dst[2] + src[m + 32],10); + dst[3] = av_clip_uintp2(dst[3] + src[m + 40],10); + dst[4] = av_clip_uintp2(dst[4] + src[m + 64],10); + dst[5] = av_clip_uintp2(dst[5] + src[m + 72],10); + dst[6] = av_clip_uintp2(dst[6] + src[m + 96],10); + dst[7] = av_clip_uintp2(dst[7] + src[m + 104],10); + + dst[8] = av_clip_uintp2(dst[8] + src[m + 128],10); + dst[9] = av_clip_uintp2(dst[9] + src[m + 136],10); + dst[10] = av_clip_uintp2(dst[10] + src[m + 160],10); + dst[11] = av_clip_uintp2(dst[11] + src[m + 168],10); + dst[12] = av_clip_uintp2(dst[12] + src[m + 192],10); + dst[13] = av_clip_uintp2(dst[13] + src[m + 200],10); + dst[14] = av_clip_uintp2(dst[14] + src[m + 224],10); + dst[15] = av_clip_uintp2(dst[15] + src[m + 232],10); + m += 1; + dst += stride; + } + if (!i) { + m128iS0 = _mm_load_si128((__m128i *) (src + 16)); + m128iS1 = _mm_load_si128((__m128i *) (src + 48)); + m128iS2 = _mm_load_si128((__m128i *) (src + 80)); + m128iS3 = _mm_loadu_si128((__m128i *) (src + 112)); + m128iS4 = _mm_load_si128((__m128i *) (src + 144)); + m128iS5 = _mm_load_si128((__m128i *) (src + 176)); + m128iS6 = _mm_load_si128((__m128i *) (src + 208)); + m128iS7 = _mm_load_si128((__m128i *) (src + 240)); + m128iS8 = _mm_load_si128((__m128i *) (src + 24)); + m128iS9 = _mm_load_si128((__m128i *) (src + 56)); + m128iS10 = _mm_load_si128((__m128i *) (src + 88)); + m128iS11 = _mm_loadu_si128((__m128i *) (src + 120)); + m128iS12 = 
_mm_load_si128((__m128i *) (src + 152)); + m128iS13 = _mm_load_si128((__m128i *) (src + 184)); + m128iS14 = _mm_load_si128((__m128i *) (src + 216)); + m128iS15 = _mm_load_si128((__m128i *) (src + 248)); + } + } + } + } + +} +#endif + + +#if HAVE_SSE4_1 +void ff_hevc_transform_32x32_add_8_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + uint8_t shift_2nd = 12; // 20 - Bit depth + uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1)) + int i, j; + uint8_t *dst = (uint8_t*) _dst; + ptrdiff_t stride = _stride / sizeof(uint8_t); + int shift; + const int16_t *src = coeffs; + + __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, + m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13, + m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, + m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h, + E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h, + O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l, + E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h; + __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l; + __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h, + EEE0l, EEE1l, EEE0h, EEE1h; + __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21, + m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27, + m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9, + m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15, + O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l, + O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l, + EE4l, EE7h, EE6h, EE5h, EE4h; + + __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15,r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31; + __m128i r32,r33,r34,r35,r36,r37,r38,r39,r40,r41,r42,r43,r44,r45,r46,r47,r48,r49,r50,r51,r52,r53,r54,r55,r56,r57,r58,r59,r60,r61,r62,r63; + __m128i 
r64,r65,r66,r67,r68,r69,r70,r71,r72,r73,r74,r75,r76,r77,r78,r79,r80,r81,r82,r83,r84,r85,r86,r87,r88,r89,r90,r91,r92,r93,r94,r95; + __m128i r96,r97,r98,r99,r100,r101,r102,r103,r104,r105,r106,r107,r108,r109,r110,r111,r112,r113,r114,r115,r116,r117,r118,r119,r120,r121,r122,r123,r124,r125,r126,r127; + + + m128iS0 = _mm_load_si128((__m128i *) (src)); + m128iS1 = _mm_load_si128((__m128i *) (src + 32)); + m128iS2 = _mm_load_si128((__m128i *) (src + 64)); + m128iS3 = _mm_load_si128((__m128i *) (src + 96)); + m128iS4 = _mm_loadu_si128((__m128i *) (src + 128)); + m128iS5 = _mm_load_si128((__m128i *) (src + 160)); + m128iS6 = _mm_load_si128((__m128i *) (src + 192)); + m128iS7 = _mm_load_si128((__m128i *) (src + 224)); + m128iS8 = _mm_load_si128((__m128i *) (src + 256)); + m128iS9 = _mm_load_si128((__m128i *) (src + 288)); + m128iS10 = _mm_load_si128((__m128i *) (src + 320)); + m128iS11 = _mm_load_si128((__m128i *) (src + 352)); + m128iS12 = _mm_load_si128((__m128i *) (src + 384)); + m128iS13 = _mm_load_si128((__m128i *) (src + 416)); + m128iS14 = _mm_load_si128((__m128i *) (src + 448)); + m128iS15 = _mm_load_si128((__m128i *) (src + 480)); + m128iS16 = _mm_load_si128((__m128i *) (src + 512)); + m128iS17 = _mm_load_si128((__m128i *) (src + 544)); + m128iS18 = _mm_load_si128((__m128i *) (src + 576)); + m128iS19 = _mm_load_si128((__m128i *) (src + 608)); + m128iS20 = _mm_load_si128((__m128i *) (src + 640)); + m128iS21 = _mm_load_si128((__m128i *) (src + 672)); + m128iS22 = _mm_load_si128((__m128i *) (src + 704)); + m128iS23 = _mm_load_si128((__m128i *) (src + 736)); + m128iS24 = _mm_load_si128((__m128i *) (src + 768)); + m128iS25 = _mm_load_si128((__m128i *) (src + 800)); + m128iS26 = _mm_load_si128((__m128i *) (src + 832)); + m128iS27 = _mm_load_si128((__m128i *) (src + 864)); + m128iS28 = _mm_load_si128((__m128i *) (src + 896)); + m128iS29 = _mm_load_si128((__m128i *) (src + 928)); + m128iS30 = _mm_load_si128((__m128i *) (src + 960)); + m128iS31 = _mm_load_si128((__m128i *) 
(src + 992)); + + shift = shift_1st; + m128iAdd = _mm_set1_epi32(add_1st); + + for (j = 0; j < 2; j++) { + for (i = 0; i < 32; i += 8) { + m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][0]))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][0]))); + + m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][0]))); + m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][0]))); + + m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][0]))); + m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][0]))); + + m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19); + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][0]))); + m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][0]))); + + m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][0]))); + m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][0]))); + + m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][0]))); + m128Tmp13 = 
_mm_unpackhi_epi16(m128iS25, m128iS27); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][0]))); + + m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][0]))); + m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][0]))); + + O0l = _mm_add_epi32(E0l, E1l); + O0l = _mm_add_epi32(O0l, E2l); + O0l = _mm_add_epi32(O0l, E3l); + O0l = _mm_add_epi32(O0l, E4l); + O0l = _mm_add_epi32(O0l, E5l); + O0l = _mm_add_epi32(O0l, E6l); + O0l = _mm_add_epi32(O0l, E7l); + + O0h = _mm_add_epi32(E0h, E1h); + O0h = _mm_add_epi32(O0h, E2h); + O0h = _mm_add_epi32(O0h, E3h); + O0h = _mm_add_epi32(O0h, E4h); + O0h = _mm_add_epi32(O0h, E5h); + O0h = _mm_add_epi32(O0h, E6h); + O0h = _mm_add_epi32(O0h, E7h); + + /* Compute O1*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][1]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][1]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][1]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][1]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][1]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][1]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][1]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][1]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][1]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][1]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][1]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][1]))); + E6l = _mm_madd_epi16(m128Tmp12, + 
_mm_load_si128((__m128i *) (transform32x32[6][1]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][1]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][1]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][1]))); + + O1l = _mm_add_epi32(E0l, E1l); + O1l = _mm_add_epi32(O1l, E2l); + O1l = _mm_add_epi32(O1l, E3l); + O1l = _mm_add_epi32(O1l, E4l); + O1l = _mm_add_epi32(O1l, E5l); + O1l = _mm_add_epi32(O1l, E6l); + O1l = _mm_add_epi32(O1l, E7l); + + O1h = _mm_add_epi32(E0h, E1h); + O1h = _mm_add_epi32(O1h, E2h); + O1h = _mm_add_epi32(O1h, E3h); + O1h = _mm_add_epi32(O1h, E4h); + O1h = _mm_add_epi32(O1h, E5h); + O1h = _mm_add_epi32(O1h, E6h); + O1h = _mm_add_epi32(O1h, E7h); + /* Compute O2*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][2]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][2]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][2]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][2]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][2]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][2]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][2]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][2]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][2]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][2]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][2]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][2]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][2]))); + E6h = _mm_madd_epi16(m128Tmp13, + 
_mm_load_si128((__m128i *) (transform32x32[6][2]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][2]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][2]))); + + O2l = _mm_add_epi32(E0l, E1l); + O2l = _mm_add_epi32(O2l, E2l); + O2l = _mm_add_epi32(O2l, E3l); + O2l = _mm_add_epi32(O2l, E4l); + O2l = _mm_add_epi32(O2l, E5l); + O2l = _mm_add_epi32(O2l, E6l); + O2l = _mm_add_epi32(O2l, E7l); + + O2h = _mm_add_epi32(E0h, E1h); + O2h = _mm_add_epi32(O2h, E2h); + O2h = _mm_add_epi32(O2h, E3h); + O2h = _mm_add_epi32(O2h, E4h); + O2h = _mm_add_epi32(O2h, E5h); + O2h = _mm_add_epi32(O2h, E6h); + O2h = _mm_add_epi32(O2h, E7h); + /* Compute O3*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][3]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][3]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][3]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][3]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][3]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][3]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][3]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][3]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][3]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][3]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][3]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][3]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][3]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][3]))); + E7l = _mm_madd_epi16(m128Tmp14, + 
_mm_load_si128((__m128i *) (transform32x32[7][3]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][3]))); + + O3l = _mm_add_epi32(E0l, E1l); + O3l = _mm_add_epi32(O3l, E2l); + O3l = _mm_add_epi32(O3l, E3l); + O3l = _mm_add_epi32(O3l, E4l); + O3l = _mm_add_epi32(O3l, E5l); + O3l = _mm_add_epi32(O3l, E6l); + O3l = _mm_add_epi32(O3l, E7l); + + O3h = _mm_add_epi32(E0h, E1h); + O3h = _mm_add_epi32(O3h, E2h); + O3h = _mm_add_epi32(O3h, E3h); + O3h = _mm_add_epi32(O3h, E4h); + O3h = _mm_add_epi32(O3h, E5h); + O3h = _mm_add_epi32(O3h, E6h); + O3h = _mm_add_epi32(O3h, E7h); + /* Compute O4*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][4]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][4]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][4]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][4]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][4]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][4]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][4]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][4]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][4]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][4]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][4]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][4]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][4]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][4]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][4]))); + E7h = _mm_madd_epi16(m128Tmp15, + 
_mm_load_si128((__m128i *) (transform32x32[7][4]))); + + O4l = _mm_add_epi32(E0l, E1l); + O4l = _mm_add_epi32(O4l, E2l); + O4l = _mm_add_epi32(O4l, E3l); + O4l = _mm_add_epi32(O4l, E4l); + O4l = _mm_add_epi32(O4l, E5l); + O4l = _mm_add_epi32(O4l, E6l); + O4l = _mm_add_epi32(O4l, E7l); + + O4h = _mm_add_epi32(E0h, E1h); + O4h = _mm_add_epi32(O4h, E2h); + O4h = _mm_add_epi32(O4h, E3h); + O4h = _mm_add_epi32(O4h, E4h); + O4h = _mm_add_epi32(O4h, E5h); + O4h = _mm_add_epi32(O4h, E6h); + O4h = _mm_add_epi32(O4h, E7h); + + /* Compute O5*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][5]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][5]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][5]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][5]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][5]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][5]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][5]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][5]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][5]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][5]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][5]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][5]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][5]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][5]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][5]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][5]))); + + O5l = _mm_add_epi32(E0l, E1l); + O5l = 
_mm_add_epi32(O5l, E2l); + O5l = _mm_add_epi32(O5l, E3l); + O5l = _mm_add_epi32(O5l, E4l); + O5l = _mm_add_epi32(O5l, E5l); + O5l = _mm_add_epi32(O5l, E6l); + O5l = _mm_add_epi32(O5l, E7l); + + O5h = _mm_add_epi32(E0h, E1h); + O5h = _mm_add_epi32(O5h, E2h); + O5h = _mm_add_epi32(O5h, E3h); + O5h = _mm_add_epi32(O5h, E4h); + O5h = _mm_add_epi32(O5h, E5h); + O5h = _mm_add_epi32(O5h, E6h); + O5h = _mm_add_epi32(O5h, E7h); + + /* Compute O6*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][6]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][6]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][6]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][6]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][6]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][6]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][6]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][6]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][6]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][6]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][6]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][6]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][6]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][6]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][6]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][6]))); + + O6l = _mm_add_epi32(E0l, E1l); + O6l = _mm_add_epi32(O6l, E2l); + O6l = _mm_add_epi32(O6l, E3l); + O6l = _mm_add_epi32(O6l, E4l); + 
O6l = _mm_add_epi32(O6l, E5l); + O6l = _mm_add_epi32(O6l, E6l); + O6l = _mm_add_epi32(O6l, E7l); + + O6h = _mm_add_epi32(E0h, E1h); + O6h = _mm_add_epi32(O6h, E2h); + O6h = _mm_add_epi32(O6h, E3h); + O6h = _mm_add_epi32(O6h, E4h); + O6h = _mm_add_epi32(O6h, E5h); + O6h = _mm_add_epi32(O6h, E6h); + O6h = _mm_add_epi32(O6h, E7h); + + /* Compute O7*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][7]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][7]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][7]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][7]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][7]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][7]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][7]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][7]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][7]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][7]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][7]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][7]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][7]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][7]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][7]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][7]))); + + O7l = _mm_add_epi32(E0l, E1l); + O7l = _mm_add_epi32(O7l, E2l); + O7l = _mm_add_epi32(O7l, E3l); + O7l = _mm_add_epi32(O7l, E4l); + O7l = _mm_add_epi32(O7l, E5l); + O7l = _mm_add_epi32(O7l, E6l); + O7l = _mm_add_epi32(O7l, 
E7l); + + O7h = _mm_add_epi32(E0h, E1h); + O7h = _mm_add_epi32(O7h, E2h); + O7h = _mm_add_epi32(O7h, E3h); + O7h = _mm_add_epi32(O7h, E4h); + O7h = _mm_add_epi32(O7h, E5h); + O7h = _mm_add_epi32(O7h, E6h); + O7h = _mm_add_epi32(O7h, E7h); + + /* Compute O8*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][8]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][8]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][8]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][8]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][8]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][8]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][8]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][8]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][8]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][8]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][8]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][8]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][8]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][8]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][8]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][8]))); + + O8l = _mm_add_epi32(E0l, E1l); + O8l = _mm_add_epi32(O8l, E2l); + O8l = _mm_add_epi32(O8l, E3l); + O8l = _mm_add_epi32(O8l, E4l); + O8l = _mm_add_epi32(O8l, E5l); + O8l = _mm_add_epi32(O8l, E6l); + O8l = _mm_add_epi32(O8l, E7l); + + O8h = _mm_add_epi32(E0h, E1h); + O8h = _mm_add_epi32(O8h, E2h); + O8h = 
_mm_add_epi32(O8h, E3h); + O8h = _mm_add_epi32(O8h, E4h); + O8h = _mm_add_epi32(O8h, E5h); + O8h = _mm_add_epi32(O8h, E6h); + O8h = _mm_add_epi32(O8h, E7h); + + /* Compute O9*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][9]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][9]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][9]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][9]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][9]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][9]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][9]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][9]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][9]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][9]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][9]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][9]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][9]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][9]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][9]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][9]))); + + O9l = _mm_add_epi32(E0l, E1l); + O9l = _mm_add_epi32(O9l, E2l); + O9l = _mm_add_epi32(O9l, E3l); + O9l = _mm_add_epi32(O9l, E4l); + O9l = _mm_add_epi32(O9l, E5l); + O9l = _mm_add_epi32(O9l, E6l); + O9l = _mm_add_epi32(O9l, E7l); + + O9h = _mm_add_epi32(E0h, E1h); + O9h = _mm_add_epi32(O9h, E2h); + O9h = _mm_add_epi32(O9h, E3h); + O9h = _mm_add_epi32(O9h, E4h); + O9h = _mm_add_epi32(O9h, E5h); + 
O9h = _mm_add_epi32(O9h, E6h); + O9h = _mm_add_epi32(O9h, E7h); + + /* Compute 10*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][10]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][10]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][10]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][10]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][10]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][10]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][10]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][10]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][10]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][10]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][10]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][10]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][10]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][10]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][10]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][10]))); + + O10l = _mm_add_epi32(E0l, E1l); + O10l = _mm_add_epi32(O10l, E2l); + O10l = _mm_add_epi32(O10l, E3l); + O10l = _mm_add_epi32(O10l, E4l); + O10l = _mm_add_epi32(O10l, E5l); + O10l = _mm_add_epi32(O10l, E6l); + O10l = _mm_add_epi32(O10l, E7l); + + O10h = _mm_add_epi32(E0h, E1h); + O10h = _mm_add_epi32(O10h, E2h); + O10h = _mm_add_epi32(O10h, E3h); + O10h = _mm_add_epi32(O10h, E4h); + O10h = _mm_add_epi32(O10h, E5h); + O10h = _mm_add_epi32(O10h, E6h); + O10h = 
_mm_add_epi32(O10h, E7h); + + /* Compute 11*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][11]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][11]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][11]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][11]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][11]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][11]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][11]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][11]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][11]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][11]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][11]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][11]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][11]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][11]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][11]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][11]))); + + O11l = _mm_add_epi32(E0l, E1l); + O11l = _mm_add_epi32(O11l, E2l); + O11l = _mm_add_epi32(O11l, E3l); + O11l = _mm_add_epi32(O11l, E4l); + O11l = _mm_add_epi32(O11l, E5l); + O11l = _mm_add_epi32(O11l, E6l); + O11l = _mm_add_epi32(O11l, E7l); + + O11h = _mm_add_epi32(E0h, E1h); + O11h = _mm_add_epi32(O11h, E2h); + O11h = _mm_add_epi32(O11h, E3h); + O11h = _mm_add_epi32(O11h, E4h); + O11h = _mm_add_epi32(O11h, E5h); + O11h = _mm_add_epi32(O11h, E6h); + O11h = _mm_add_epi32(O11h, E7h); + + /* Compute 12*/ + + E0l 
= _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][12]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][12]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][12]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][12]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][12]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][12]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][12]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][12]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][12]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][12]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][12]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][12]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][12]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][12]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][12]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][12]))); + + O12l = _mm_add_epi32(E0l, E1l); + O12l = _mm_add_epi32(O12l, E2l); + O12l = _mm_add_epi32(O12l, E3l); + O12l = _mm_add_epi32(O12l, E4l); + O12l = _mm_add_epi32(O12l, E5l); + O12l = _mm_add_epi32(O12l, E6l); + O12l = _mm_add_epi32(O12l, E7l); + + O12h = _mm_add_epi32(E0h, E1h); + O12h = _mm_add_epi32(O12h, E2h); + O12h = _mm_add_epi32(O12h, E3h); + O12h = _mm_add_epi32(O12h, E4h); + O12h = _mm_add_epi32(O12h, E5h); + O12h = _mm_add_epi32(O12h, E6h); + O12h = _mm_add_epi32(O12h, E7h); + + /* Compute 13*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i 
*) (transform32x32[0][13]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][13]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][13]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][13]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][13]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][13]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][13]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][13]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][13]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][13]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][13]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][13]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][13]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][13]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][13]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][13]))); + + O13l = _mm_add_epi32(E0l, E1l); + O13l = _mm_add_epi32(O13l, E2l); + O13l = _mm_add_epi32(O13l, E3l); + O13l = _mm_add_epi32(O13l, E4l); + O13l = _mm_add_epi32(O13l, E5l); + O13l = _mm_add_epi32(O13l, E6l); + O13l = _mm_add_epi32(O13l, E7l); + + O13h = _mm_add_epi32(E0h, E1h); + O13h = _mm_add_epi32(O13h, E2h); + O13h = _mm_add_epi32(O13h, E3h); + O13h = _mm_add_epi32(O13h, E4h); + O13h = _mm_add_epi32(O13h, E5h); + O13h = _mm_add_epi32(O13h, E6h); + O13h = _mm_add_epi32(O13h, E7h); + + /* Compute O14 */ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][14]))); + E0h = 
_mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][14]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][14]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][14]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][14]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][14]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][14]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][14]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][14]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][14]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][14]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][14]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][14]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][14]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][14]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][14]))); + + O14l = _mm_add_epi32(E0l, E1l); + O14l = _mm_add_epi32(O14l, E2l); + O14l = _mm_add_epi32(O14l, E3l); + O14l = _mm_add_epi32(O14l, E4l); + O14l = _mm_add_epi32(O14l, E5l); + O14l = _mm_add_epi32(O14l, E6l); + O14l = _mm_add_epi32(O14l, E7l); + + O14h = _mm_add_epi32(E0h, E1h); + O14h = _mm_add_epi32(O14h, E2h); + O14h = _mm_add_epi32(O14h, E3h); + O14h = _mm_add_epi32(O14h, E4h); + O14h = _mm_add_epi32(O14h, E5h); + O14h = _mm_add_epi32(O14h, E6h); + O14h = _mm_add_epi32(O14h, E7h); + + /* Compute O15*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][15]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i 
*) (transform32x32[0][15]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][15]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][15]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][15]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][15]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][15]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][15]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][15]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][15]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][15]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][15]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][15]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][15]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][15]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][15]))); + + O15l = _mm_add_epi32(E0l, E1l); + O15l = _mm_add_epi32(O15l, E2l); + O15l = _mm_add_epi32(O15l, E3l); + O15l = _mm_add_epi32(O15l, E4l); + O15l = _mm_add_epi32(O15l, E5l); + O15l = _mm_add_epi32(O15l, E6l); + O15l = _mm_add_epi32(O15l, E7l); + + O15h = _mm_add_epi32(E0h, E1h); + O15h = _mm_add_epi32(O15h, E2h); + O15h = _mm_add_epi32(O15h, E3h); + O15h = _mm_add_epi32(O15h, E4h); + O15h = _mm_add_epi32(O15h, E5h); + O15h = _mm_add_epi32(O15h, E6h); + O15h = _mm_add_epi32(O15h, E7h); + /* Compute E0 */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); + E0h = 
_mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14); + E0l = _mm_add_epi32(E0l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][0])))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14); + E0h = _mm_add_epi32(E0h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][0])))); + + m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22); + E0l = _mm_add_epi32(E0l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][0])))); + m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22); + E0h = _mm_add_epi32(E0h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][0])))); + + m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30); + E0l = _mm_add_epi32(E0l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][0])))); + m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30); + E0h = _mm_add_epi32(E0h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][0])))); + + /* Compute E1 */ + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); + E1l = _mm_add_epi32(E1l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][1])))); + E1h = _mm_add_epi32(E1h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][1])))); + E1l = _mm_add_epi32(E1l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][1])))); + E1h = _mm_add_epi32(E1h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][1])))); + E1l = _mm_add_epi32(E1l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][1])))); + E1h = _mm_add_epi32(E1h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][1])))); + + /* 
Compute E2 */ + E2l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); + E2h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); + E2l = _mm_add_epi32(E2l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][2])))); + E2h = _mm_add_epi32(E2h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][2])))); + E2l = _mm_add_epi32(E2l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][2])))); + E2h = _mm_add_epi32(E2h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][2])))); + E2l = _mm_add_epi32(E2l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][2])))); + E2h = _mm_add_epi32(E2h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][2])))); + + /* Compute E3 */ + E3l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); + E3h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); + E3l = _mm_add_epi32(E3l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][3])))); + E3h = _mm_add_epi32(E3h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][3])))); + E3l = _mm_add_epi32(E3l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][3])))); + E3h = _mm_add_epi32(E3h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][3])))); + E3l = _mm_add_epi32(E3l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][3])))); + E3h = _mm_add_epi32(E3h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][3])))); + + /* Compute E4 */ + E4l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); + E4h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); + E4l = 
_mm_add_epi32(E4l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][4])))); + E4h = _mm_add_epi32(E4h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][4])))); + E4l = _mm_add_epi32(E4l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][4])))); + E4h = _mm_add_epi32(E4h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][4])))); + E4l = _mm_add_epi32(E4l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][4])))); + E4h = _mm_add_epi32(E4h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][4])))); + + /* Compute E5 */ + E5l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); + E5h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); + E5l = _mm_add_epi32(E5l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][5])))); + E5h = _mm_add_epi32(E5h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][5])))); + E5l = _mm_add_epi32(E5l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][5])))); + E5h = _mm_add_epi32(E5h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][5])))); + E5l = _mm_add_epi32(E5l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][5])))); + E5h = _mm_add_epi32(E5h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][5])))); + + /* Compute E6 */ + E6l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); + E6h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); + E6l = _mm_add_epi32(E6l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][6])))); + E6h = _mm_add_epi32(E6h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *)
(transform16x16_1[1][6])))); + E6l = _mm_add_epi32(E6l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][6])))); + E6h = _mm_add_epi32(E6h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][6])))); + E6l = _mm_add_epi32(E6l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][6])))); + E6h = _mm_add_epi32(E6h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][6])))); + + /* Compute E7 */ + E7l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); + E7h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); + E7l = _mm_add_epi32(E7l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][7])))); + E7h = _mm_add_epi32(E7h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][7])))); + E7l = _mm_add_epi32(E7l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][7])))); + E7h = _mm_add_epi32(E7h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][7])))); + E7l = _mm_add_epi32(E7l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][7])))); + E7h = _mm_add_epi32(E7h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][7])))); + + /* Compute E00, E01, E02 and E03 */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12); + E00l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12); + E00h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28); + E00l = _mm_add_epi32(E00l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][0])))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28); + E00h = _mm_add_epi32(E00h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( +
(__m128i *) (transform16x16_2[1][0])))); + + E01l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); + E01h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); + E01l = _mm_add_epi32(E01l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][1])))); + E01h = _mm_add_epi32(E01h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][1])))); + + E02l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); + E02h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); + E02l = _mm_add_epi32(E02l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][2])))); + E02h = _mm_add_epi32(E02h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][2])))); + + E03l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); + E03h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); + E03l = _mm_add_epi32(E03l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][3])))); + E03h = _mm_add_epi32(E03h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][3])))); + + /* Compute EE0 and EEE */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24); + EE0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24); + EE0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16); + EEE0l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16); + EEE0h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); + + EE1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); + 
EE1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); + + EEE1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); + EEE1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); + + /* Compute EE */ + + EE2l = _mm_sub_epi32(EEE1l, EE1l); + EE3l = _mm_sub_epi32(EEE0l, EE0l); + EE2h = _mm_sub_epi32(EEE1h, EE1h); + EE3h = _mm_sub_epi32(EEE0h, EE0h); + + EE0l = _mm_add_epi32(EEE0l, EE0l); + EE1l = _mm_add_epi32(EEE1l, EE1l); + EE0h = _mm_add_epi32(EEE0h, EE0h); + EE1h = _mm_add_epi32(EEE1h, EE1h); + /**/ + + EE7l = _mm_sub_epi32(EE0l, E00l); + EE6l = _mm_sub_epi32(EE1l, E01l); + EE5l = _mm_sub_epi32(EE2l, E02l); + EE4l = _mm_sub_epi32(EE3l, E03l); + + EE7h = _mm_sub_epi32(EE0h, E00h); + EE6h = _mm_sub_epi32(EE1h, E01h); + EE5h = _mm_sub_epi32(EE2h, E02h); + EE4h = _mm_sub_epi32(EE3h, E03h); + + EE0l = _mm_add_epi32(EE0l, E00l); + EE1l = _mm_add_epi32(EE1l, E01l); + EE2l = _mm_add_epi32(EE2l, E02l); + EE3l = _mm_add_epi32(EE3l, E03l); + + EE0h = _mm_add_epi32(EE0h, E00h); + EE1h = _mm_add_epi32(EE1h, E01h); + EE2h = _mm_add_epi32(EE2h, E02h); + EE3h = _mm_add_epi32(EE3h, E03h); + /* Compute E */ + + E15l = _mm_sub_epi32(EE0l, E0l); + E15l = _mm_add_epi32(E15l, m128iAdd); + E14l = _mm_sub_epi32(EE1l, E1l); + E14l = _mm_add_epi32(E14l, m128iAdd); + E13l = _mm_sub_epi32(EE2l, E2l); + E13l = _mm_add_epi32(E13l, m128iAdd); + E12l = _mm_sub_epi32(EE3l, E3l); + E12l = _mm_add_epi32(E12l, m128iAdd); + E11l = _mm_sub_epi32(EE4l, E4l); + E11l = _mm_add_epi32(E11l, m128iAdd); + E10l = _mm_sub_epi32(EE5l, E5l); + E10l = _mm_add_epi32(E10l, m128iAdd); + E9l = _mm_sub_epi32(EE6l, E6l); + E9l = _mm_add_epi32(E9l, m128iAdd); + E8l = _mm_sub_epi32(EE7l, E7l); + E8l = _mm_add_epi32(E8l, m128iAdd); + + E0l = _mm_add_epi32(EE0l, E0l); + E0l = _mm_add_epi32(E0l, m128iAdd); + E1l = _mm_add_epi32(EE1l, E1l); + E1l = _mm_add_epi32(E1l, m128iAdd); + E2l = _mm_add_epi32(EE2l, E2l); + E2l = 
_mm_add_epi32(E2l, m128iAdd); + E3l = _mm_add_epi32(EE3l, E3l); + E3l = _mm_add_epi32(E3l, m128iAdd); + E4l = _mm_add_epi32(EE4l, E4l); + E4l = _mm_add_epi32(E4l, m128iAdd); + E5l = _mm_add_epi32(EE5l, E5l); + E5l = _mm_add_epi32(E5l, m128iAdd); + E6l = _mm_add_epi32(EE6l, E6l); + E6l = _mm_add_epi32(E6l, m128iAdd); + E7l = _mm_add_epi32(EE7l, E7l); + E7l = _mm_add_epi32(E7l, m128iAdd); + + E15h = _mm_sub_epi32(EE0h, E0h); + E15h = _mm_add_epi32(E15h, m128iAdd); + E14h = _mm_sub_epi32(EE1h, E1h); + E14h = _mm_add_epi32(E14h, m128iAdd); + E13h = _mm_sub_epi32(EE2h, E2h); + E13h = _mm_add_epi32(E13h, m128iAdd); + E12h = _mm_sub_epi32(EE3h, E3h); + E12h = _mm_add_epi32(E12h, m128iAdd); + E11h = _mm_sub_epi32(EE4h, E4h); + E11h = _mm_add_epi32(E11h, m128iAdd); + E10h = _mm_sub_epi32(EE5h, E5h); + E10h = _mm_add_epi32(E10h, m128iAdd); + E9h = _mm_sub_epi32(EE6h, E6h); + E9h = _mm_add_epi32(E9h, m128iAdd); + E8h = _mm_sub_epi32(EE7h, E7h); + E8h = _mm_add_epi32(E8h, m128iAdd); + + E0h = _mm_add_epi32(EE0h, E0h); + E0h = _mm_add_epi32(E0h, m128iAdd); + E1h = _mm_add_epi32(EE1h, E1h); + E1h = _mm_add_epi32(E1h, m128iAdd); + E2h = _mm_add_epi32(EE2h, E2h); + E2h = _mm_add_epi32(E2h, m128iAdd); + E3h = _mm_add_epi32(EE3h, E3h); + E3h = _mm_add_epi32(E3h, m128iAdd); + E4h = _mm_add_epi32(EE4h, E4h); + E4h = _mm_add_epi32(E4h, m128iAdd); + E5h = _mm_add_epi32(EE5h, E5h); + E5h = _mm_add_epi32(E5h, m128iAdd); + E6h = _mm_add_epi32(EE6h, E6h); + E6h = _mm_add_epi32(E6h, m128iAdd); + E7h = _mm_add_epi32(EE7h, E7h); + E7h = _mm_add_epi32(E7h, m128iAdd); + + m128iS0 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift), + _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift)); + m128iS1 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift), + _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift)); + m128iS2 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift), + _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift)); + m128iS3 = _mm_packs_epi32( + 
_mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift), + _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift)); + m128iS4 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift), + _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift)); + m128iS5 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift), + _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift)); + m128iS6 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift), + _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift)); + m128iS7 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift), + _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift)); + m128iS8 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift), + _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift)); + m128iS9 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift), + _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift)); + m128iS10 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift), + _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift)); + m128iS11 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift), + _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift)); + m128iS12 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift), + _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift)); + m128iS13 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift), + _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift)); + m128iS14 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift), + _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift)); + m128iS15 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift), + _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift)); + + m128iS31 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift), + _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift)); + m128iS30 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift), + _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift)); + m128iS29 = _mm_packs_epi32( + 
_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift), + _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift)); + m128iS28 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift), + _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift)); + m128iS27 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift), + _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift)); + m128iS26 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift), + _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift)); + m128iS25 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift), + _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift)); + m128iS24 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift), + _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift)); + m128iS23 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift), + _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift)); + m128iS22 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift), + _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift)); + m128iS21 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift), + _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift)); + m128iS20 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift), + _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift)); + m128iS19 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift), + _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift)); + m128iS18 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift), + _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift)); + m128iS17 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift), + _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift)); + m128iS16 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift), + _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift)); + + if (!j) { + /* Inverse the matrix */ + E0l = _mm_unpacklo_epi16(m128iS0, m128iS16); + E1l = _mm_unpacklo_epi16(m128iS1, m128iS17); + E2l = _mm_unpacklo_epi16(m128iS2, 
m128iS18); + E3l = _mm_unpacklo_epi16(m128iS3, m128iS19); + E4l = _mm_unpacklo_epi16(m128iS4, m128iS20); + E5l = _mm_unpacklo_epi16(m128iS5, m128iS21); + E6l = _mm_unpacklo_epi16(m128iS6, m128iS22); + E7l = _mm_unpacklo_epi16(m128iS7, m128iS23); + E8l = _mm_unpacklo_epi16(m128iS8, m128iS24); + E9l = _mm_unpacklo_epi16(m128iS9, m128iS25); + E10l = _mm_unpacklo_epi16(m128iS10, m128iS26); + E11l = _mm_unpacklo_epi16(m128iS11, m128iS27); + E12l = _mm_unpacklo_epi16(m128iS12, m128iS28); + E13l = _mm_unpacklo_epi16(m128iS13, m128iS29); + E14l = _mm_unpacklo_epi16(m128iS14, m128iS30); + E15l = _mm_unpacklo_epi16(m128iS15, m128iS31); + + O0l = _mm_unpackhi_epi16(m128iS0, m128iS16); + O1l = _mm_unpackhi_epi16(m128iS1, m128iS17); + O2l = _mm_unpackhi_epi16(m128iS2, m128iS18); + O3l = _mm_unpackhi_epi16(m128iS3, m128iS19); + O4l = _mm_unpackhi_epi16(m128iS4, m128iS20); + O5l = _mm_unpackhi_epi16(m128iS5, m128iS21); + O6l = _mm_unpackhi_epi16(m128iS6, m128iS22); + O7l = _mm_unpackhi_epi16(m128iS7, m128iS23); + O8l = _mm_unpackhi_epi16(m128iS8, m128iS24); + O9l = _mm_unpackhi_epi16(m128iS9, m128iS25); + O10l = _mm_unpackhi_epi16(m128iS10, m128iS26); + O11l = _mm_unpackhi_epi16(m128iS11, m128iS27); + O12l = _mm_unpackhi_epi16(m128iS12, m128iS28); + O13l = _mm_unpackhi_epi16(m128iS13, m128iS29); + O14l = _mm_unpackhi_epi16(m128iS14, m128iS30); + O15l = _mm_unpackhi_epi16(m128iS15, m128iS31); + + E0h = _mm_unpacklo_epi16(E0l, E8l); + E1h = _mm_unpacklo_epi16(E1l, E9l); + E2h = _mm_unpacklo_epi16(E2l, E10l); + E3h = _mm_unpacklo_epi16(E3l, E11l); + E4h = _mm_unpacklo_epi16(E4l, E12l); + E5h = _mm_unpacklo_epi16(E5l, E13l); + E6h = _mm_unpacklo_epi16(E6l, E14l); + E7h = _mm_unpacklo_epi16(E7l, E15l); + + E8h = _mm_unpackhi_epi16(E0l, E8l); + E9h = _mm_unpackhi_epi16(E1l, E9l); + E10h = _mm_unpackhi_epi16(E2l, E10l); + E11h = _mm_unpackhi_epi16(E3l, E11l); + E12h = _mm_unpackhi_epi16(E4l, E12l); + E13h = _mm_unpackhi_epi16(E5l, E13l); + E14h = _mm_unpackhi_epi16(E6l, E14l); + E15h = 
_mm_unpackhi_epi16(E7l, E15l); + + m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); + m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); + m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); + m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); + m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); + m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); + m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h); + m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h); + m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h); + m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h); + m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h); + m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h); + m128Tmp3 = 
_mm_unpackhi_epi16(E11h, E15h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + /* */ + E0h = _mm_unpacklo_epi16(O0l, O8l); + E1h = _mm_unpacklo_epi16(O1l, O9l); + E2h = _mm_unpacklo_epi16(O2l, O10l); + E3h = _mm_unpacklo_epi16(O3l, O11l); + E4h = _mm_unpacklo_epi16(O4l, O12l); + E5h = _mm_unpacklo_epi16(O5l, O13l); + E6h = _mm_unpacklo_epi16(O6l, O14l); + E7h = _mm_unpacklo_epi16(O7l, O15l); + + E8h = _mm_unpackhi_epi16(O0l, O8l); + E9h = _mm_unpackhi_epi16(O1l, O9l); + E10h = _mm_unpackhi_epi16(O2l, O10l); + E11h = _mm_unpackhi_epi16(O3l, O11l); + E12h = _mm_unpackhi_epi16(O4l, O12l); + E13h = _mm_unpackhi_epi16(O5l, O13l); + E14h = _mm_unpackhi_epi16(O6l, O14l); + E15h = _mm_unpackhi_epi16(O7l, O15l); + + m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); + m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); + m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); + m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); + m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); + m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); + m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS20 = 
_mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h); + m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h); + m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h); + m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h); + m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h); + m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h); + m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + if(i==0){ + int k = 8; + r0=m128iS0; + r1=m128iS1; + r2=m128iS2; + r3=m128iS3; + r4=m128iS4; + r5=m128iS5; + r6=m128iS6; + r7=m128iS7; + r8=m128iS8; + r9=m128iS9; + r10=m128iS10; + r11=m128iS11; + r12=m128iS12; + r13=m128iS13; + r14=m128iS14; + r15=m128iS15; + r16=m128iS16; + r17=m128iS17; + r18=m128iS18; + r19=m128iS19; + r20=m128iS20; + r21=m128iS21; + r22=m128iS22; + r23=m128iS23; + r24=m128iS24; + r25=m128iS25; + r26=m128iS26; + r27=m128iS27; + r28=m128iS28; + 
r29=m128iS29; + r30=m128iS30; + r31=m128iS31; + m128iS0 = _mm_load_si128((__m128i *) (src + k)); + m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k)); + m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k)); + m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k)); + m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k)); + m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k)); + m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k)); + m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k)); + m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k)); + m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k)); + m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k)); + m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k)); + m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k)); + m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k)); + m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k)); + m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k)); + + m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k)); + m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k)); + m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k)); + m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k)); + m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k)); + m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k)); + m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k)); + m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k)); + m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k)); + m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k)); + m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k)); + m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k)); + m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k)); + m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k)); + m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k)); + m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k)); + + }else if(i ==8){ + + r32=m128iS0; + r33=m128iS1; + r34=m128iS2; + r35=m128iS3; + r36=m128iS4; + r37=m128iS5; + 
r38=m128iS6; + r39=m128iS7; + r40=m128iS8; + r41=m128iS9; + r42=m128iS10; + r43=m128iS11; + r44=m128iS12; + r45=m128iS13; + r46=m128iS14; + r47=m128iS15; + r48=m128iS16; + r49=m128iS17; + r50=m128iS18; + r51=m128iS19; + r52=m128iS20; + r53=m128iS21; + r54=m128iS22; + r55=m128iS23; + r56=m128iS24; + r57=m128iS25; + r58=m128iS26; + r59=m128iS27; + r60=m128iS28; + r61=m128iS29; + r62=m128iS30; + r63=m128iS31; + + m128iS0 = _mm_load_si128((__m128i *) (src + 16)); + m128iS1 = _mm_load_si128((__m128i *) (src + 48)); + m128iS2 = _mm_load_si128((__m128i *) (src + 80)); + m128iS3 = _mm_load_si128((__m128i *) (src + 112)); + m128iS4 = _mm_load_si128((__m128i *) (src + 144)); + m128iS5 = _mm_load_si128((__m128i *) (src + 176)); + m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 16)); + m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 16)); + m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 16)); + m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 16)); + m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 16)); + m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 16)); + m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 16)); + m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 16)); + m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 16)); + m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 16)); + + m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 16)); + m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 16)); + m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 16)); + m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 16)); + m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 16)); + m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 16)); + m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 16)); + m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 16)); + m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 16)); + m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 16)); + m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 16)); + m128iS27 = 
_mm_load_si128((__m128i *) (src + 864 + 16)); + m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 16)); + m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 16)); + m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 16)); + m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 16)); + + + }else if(i ==16){ + + r64=m128iS0; + r65=m128iS1; + r66=m128iS2; + r67=m128iS3; + r68=m128iS4; + r69=m128iS5; + r70=m128iS6; + r71=m128iS7; + r72=m128iS8; + r73=m128iS9; + r74=m128iS10; + r75=m128iS11; + r76=m128iS12; + r77=m128iS13; + r78=m128iS14; + r79=m128iS15; + r80=m128iS16; + r81=m128iS17; + r82=m128iS18; + r83=m128iS19; + r84=m128iS20; + r85=m128iS21; + r86=m128iS22; + r87=m128iS23; + r88=m128iS24; + r89=m128iS25; + r90=m128iS26; + r91=m128iS27; + r92=m128iS28; + r93=m128iS29; + r94=m128iS30; + r95=m128iS31; + + m128iS0 = _mm_load_si128((__m128i *) (src + 24)); + m128iS1 = _mm_load_si128((__m128i *) (src + 56)); + m128iS2 = _mm_load_si128((__m128i *) (src + 64 + 24)); + m128iS3 = _mm_load_si128((__m128i *) (src + 96 + 24)); + m128iS4 = _mm_load_si128((__m128i *) (src + 128 + 24)); + m128iS5 = _mm_load_si128((__m128i *) (src + 160 + 24)); + m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 24)); + m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 24)); + m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 24)); + m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 24)); + m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 24)); + m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 24)); + m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 24)); + m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 24)); + m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 24)); + m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 24)); + + m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 24)); + m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 24)); + m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 24)); + m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 24)); + m128iS20 = 
_mm_load_si128((__m128i *) (src + 640 + 24)); + m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 24)); + m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 24)); + m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 24)); + m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 24)); + m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 24)); + m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 24)); + m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 24)); + m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 24)); + m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 24)); + m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 24)); + m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 24)); + + }else{ + r96=m128iS0; + r97=m128iS1; + r98=m128iS2; + r99=m128iS3; + r100=m128iS4; + r101=m128iS5; + r102=m128iS6; + r103=m128iS7; + r104=m128iS8; + r105=m128iS9; + r106=m128iS10; + r107=m128iS11; + r108=m128iS12; + r109=m128iS13; + r110=m128iS14; + r111=m128iS15; + r112=m128iS16; + r113=m128iS17; + r114=m128iS18; + r115=m128iS19; + r116=m128iS20; + r117=m128iS21; + r118=m128iS22; + r119=m128iS23; + r120=m128iS24; + r121=m128iS25; + r122=m128iS26; + r123=m128iS27; + r124=m128iS28; + r125=m128iS29; + r126=m128iS30; + r127=m128iS31; + + //load data for next j : + m128iS0 = r0; + m128iS1 = r4; + m128iS2 = r8; + m128iS3 = r12; + m128iS4 = r16; + m128iS5 = r20; + m128iS6 = r24; + m128iS7 = r28; + m128iS8 = r32; + m128iS9 = r36; + m128iS10 = r40; + m128iS11 = r44; + m128iS12 = r48; + m128iS13 = r52; + m128iS14 = r56; + m128iS15 = r60; + m128iS16 = r64; + m128iS17 = r68; + m128iS18 = r72; + m128iS19 = r76; + m128iS20 = r80; + m128iS21 = r84; + m128iS22 = r88; + m128iS23 = r92; + m128iS24 = r96; + m128iS25 = r100; + m128iS26 = r104; + m128iS27 = r108; + m128iS28 = r112; + m128iS29 = r116; + m128iS30 = r120; + m128iS31 =r124; + shift = shift_2nd; + m128iAdd = _mm_set1_epi32(add_2nd); + + + } + + } else { + + //Transpose Matrix + + E0l= _mm_unpacklo_epi16(m128iS0,m128iS1); + E1l= 
_mm_unpacklo_epi16(m128iS2,m128iS3); + E2l= _mm_unpacklo_epi16(m128iS4,m128iS5); + E3l= _mm_unpacklo_epi16(m128iS6,m128iS7); + E4l= _mm_unpacklo_epi16(m128iS8,m128iS9); + E5l= _mm_unpacklo_epi16(m128iS10,m128iS11); + E6l= _mm_unpacklo_epi16(m128iS12,m128iS13); + E7l= _mm_unpacklo_epi16(m128iS14,m128iS15); + E8l= _mm_unpacklo_epi16(m128iS16,m128iS17); + E9l= _mm_unpacklo_epi16(m128iS18,m128iS19); + E10l= _mm_unpacklo_epi16(m128iS20,m128iS21); + E11l= _mm_unpacklo_epi16(m128iS22,m128iS23); + E12l= _mm_unpacklo_epi16(m128iS24,m128iS25); + E13l= _mm_unpacklo_epi16(m128iS26,m128iS27); + E14l= _mm_unpacklo_epi16(m128iS28,m128iS29); + E15l= _mm_unpacklo_epi16(m128iS30,m128iS31); + + + E0h= _mm_unpackhi_epi16(m128iS0,m128iS1); + E1h= _mm_unpackhi_epi16(m128iS2,m128iS3); + E2h= _mm_unpackhi_epi16(m128iS4,m128iS5); + E3h= _mm_unpackhi_epi16(m128iS6,m128iS7); + E4h= _mm_unpackhi_epi16(m128iS8,m128iS9); + E5h= _mm_unpackhi_epi16(m128iS10,m128iS11); + E6h= _mm_unpackhi_epi16(m128iS12,m128iS13); + E7h= _mm_unpackhi_epi16(m128iS14,m128iS15); + E8h= _mm_unpackhi_epi16(m128iS16,m128iS17); + E9h= _mm_unpackhi_epi16(m128iS18,m128iS19); + E10h= _mm_unpackhi_epi16(m128iS20,m128iS21); + E11h= _mm_unpackhi_epi16(m128iS22,m128iS23); + E12h= _mm_unpackhi_epi16(m128iS24,m128iS25); + E13h= _mm_unpackhi_epi16(m128iS26,m128iS27); + E14h= _mm_unpackhi_epi16(m128iS28,m128iS29); + E15h= _mm_unpackhi_epi16(m128iS30,m128iS31); + + m128Tmp0= _mm_unpacklo_epi32(E0l,E1l); + m128Tmp1= _mm_unpacklo_epi32(E2l,E3l); + m128Tmp2= _mm_unpacklo_epi32(E4l,E5l); + m128Tmp3= _mm_unpacklo_epi32(E6l,E7l); + m128Tmp4= _mm_unpacklo_epi32(E8l,E9l); + m128Tmp5= _mm_unpacklo_epi32(E10l,E11l); + m128Tmp6= _mm_unpacklo_epi32(E12l,E13l); + m128Tmp7= _mm_unpacklo_epi32(E14l,E15l); + + m128iS0= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter 1st row + m128iS1= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter 1st row + + + m128iS2= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter 1st row + m128iS3= 
_mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter 1st row + + //second row + + m128iS4= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter + m128iS5= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter + + m128iS6= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter + m128iS7= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter + + //third row + + m128Tmp0= _mm_unpackhi_epi32(E0l,E1l); + m128Tmp1= _mm_unpackhi_epi32(E2l,E3l); + m128Tmp2= _mm_unpackhi_epi32(E4l,E5l); + m128Tmp3= _mm_unpackhi_epi32(E6l,E7l); + m128Tmp4= _mm_unpackhi_epi32(E8l,E9l); + m128Tmp5= _mm_unpackhi_epi32(E10l,E11l); + m128Tmp6= _mm_unpackhi_epi32(E12l,E13l); + m128Tmp7= _mm_unpackhi_epi32(E14l,E15l); + + + m128iS8= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter + m128iS9= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter + + m128iS10= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter + m128iS11= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter + + //fourth row + + m128iS12= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter + m128iS13= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter + + m128iS14= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter + m128iS15= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter + + //fith row + + m128Tmp0= _mm_unpacklo_epi32(E0h,E1h); + m128Tmp1= _mm_unpacklo_epi32(E2h,E3h); + m128Tmp2= _mm_unpacklo_epi32(E4h,E5h); + m128Tmp3= _mm_unpacklo_epi32(E6h,E7h); + m128Tmp4= _mm_unpacklo_epi32(E8h,E9h); + m128Tmp5= _mm_unpacklo_epi32(E10h,E11h); + m128Tmp6= _mm_unpacklo_epi32(E12h,E13h); + m128Tmp7= _mm_unpacklo_epi32(E14h,E15h); + + m128iS16= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter + m128iS17= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter + + + m128iS18= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter + m128iS19= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); + + //sixth row + + m128iS20= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter + m128iS21= 
_mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter + + + m128iS22= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter + m128iS23= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter + + //seventh row + + m128Tmp0= _mm_unpackhi_epi32(E0h,E1h); + m128Tmp1= _mm_unpackhi_epi32(E2h,E3h); + m128Tmp2= _mm_unpackhi_epi32(E4h,E5h); + m128Tmp3= _mm_unpackhi_epi32(E6h,E7h); + m128Tmp4= _mm_unpackhi_epi32(E8h,E9h); + m128Tmp5= _mm_unpackhi_epi32(E10h,E11h); + m128Tmp6= _mm_unpackhi_epi32(E12h,E13h); + m128Tmp7= _mm_unpackhi_epi32(E14h,E15h); + + + m128iS24= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter + m128iS25= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter + + + m128iS26= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter + m128iS27= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter + + //last row + + + m128iS28= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter + m128iS29= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter + + m128iS30= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter + m128iS31= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter + + + m128Tmp0=_mm_setzero_si128(); + + + //store + dst = (uint8_t*) _dst + i*stride; + + + E0l= _mm_load_si128((__m128i*)dst); //16 values + E1l= _mm_load_si128((__m128i*)(dst+16)); + E2l= _mm_load_si128((__m128i*)(dst+stride)); + E3l= _mm_load_si128((__m128i*)(dst+stride+16)); + E4l= _mm_load_si128((__m128i*)(dst+2*stride)); + E5l= _mm_load_si128((__m128i*)(dst+2*stride+16)); + E6l= _mm_load_si128((__m128i*)(dst+3*stride)); + E7l= _mm_load_si128((__m128i*)(dst+3*stride+16)); + E8l= _mm_load_si128((__m128i*)(dst+4*stride)); + E9l= _mm_load_si128((__m128i*)(dst+4*stride+16)); + E10l= _mm_load_si128((__m128i*)(dst+5*stride)); + E11l= _mm_load_si128((__m128i*)(dst+5*stride+16)); + E12l= _mm_load_si128((__m128i*)(dst+6*stride)); + E13l= _mm_load_si128((__m128i*)(dst+6*stride+16)); + E14l= _mm_load_si128((__m128i*)(dst+7*stride)); + E15l= 
_mm_load_si128((__m128i*)(dst+7*stride+16)); + + m128iS0= _mm_adds_epi16(m128iS0,_mm_unpacklo_epi8(E0l,m128Tmp0)); + m128iS1= _mm_adds_epi16(m128iS1,_mm_unpackhi_epi8(E0l,m128Tmp0)); + m128iS0= _mm_packus_epi16(m128iS0,m128iS1); + + m128iS2= _mm_adds_epi16(m128iS2,_mm_unpacklo_epi8(E1l,m128Tmp0)); + m128iS3= _mm_adds_epi16(m128iS3,_mm_unpackhi_epi8(E1l,m128Tmp0)); + m128iS2= _mm_packus_epi16(m128iS2,m128iS3); + + m128iS4= _mm_adds_epi16(m128iS4,_mm_unpacklo_epi8(E2l,m128Tmp0)); + m128iS5= _mm_adds_epi16(m128iS5,_mm_unpackhi_epi8(E2l,m128Tmp0)); + m128iS4= _mm_packus_epi16(m128iS4,m128iS5); + + m128iS6= _mm_adds_epi16(m128iS6,_mm_unpacklo_epi8(E3l,m128Tmp0)); + m128iS7= _mm_adds_epi16(m128iS7,_mm_unpackhi_epi8(E3l,m128Tmp0)); + m128iS6= _mm_packus_epi16(m128iS6,m128iS7); + + m128iS8= _mm_adds_epi16(m128iS8,_mm_unpacklo_epi8(E4l,m128Tmp0)); + m128iS9= _mm_adds_epi16(m128iS9,_mm_unpackhi_epi8(E4l,m128Tmp0)); + m128iS8= _mm_packus_epi16(m128iS8,m128iS9); + + m128iS10= _mm_adds_epi16(m128iS10,_mm_unpacklo_epi8(E5l,m128Tmp0)); + m128iS11= _mm_adds_epi16(m128iS11,_mm_unpackhi_epi8(E5l,m128Tmp0)); + m128iS10= _mm_packus_epi16(m128iS10,m128iS11); + + m128iS12= _mm_adds_epi16(m128iS12,_mm_unpacklo_epi8(E6l,m128Tmp0)); + m128iS13= _mm_adds_epi16(m128iS13,_mm_unpackhi_epi8(E6l,m128Tmp0)); + m128iS12= _mm_packus_epi16(m128iS12,m128iS13); + + m128iS14= _mm_adds_epi16(m128iS14,_mm_unpacklo_epi8(E7l,m128Tmp0)); + m128iS15= _mm_adds_epi16(m128iS15,_mm_unpackhi_epi8(E7l,m128Tmp0)); + m128iS14= _mm_packus_epi16(m128iS14,m128iS15); + + m128iS16= _mm_adds_epi16(m128iS16,_mm_unpacklo_epi8(E8l,m128Tmp0)); + m128iS17= _mm_adds_epi16(m128iS17,_mm_unpackhi_epi8(E8l,m128Tmp0)); + m128iS16= _mm_packus_epi16(m128iS16,m128iS17); + + m128iS18= _mm_adds_epi16(m128iS18,_mm_unpacklo_epi8(E9l,m128Tmp0)); + m128iS19= _mm_adds_epi16(m128iS19,_mm_unpackhi_epi8(E9l,m128Tmp0)); + m128iS18= _mm_packus_epi16(m128iS18,m128iS19); + + m128iS20= _mm_adds_epi16(m128iS20,_mm_unpacklo_epi8(E10l,m128Tmp0)); + 
m128iS21= _mm_adds_epi16(m128iS21,_mm_unpackhi_epi8(E10l,m128Tmp0)); + m128iS20= _mm_packus_epi16(m128iS20,m128iS21); + + m128iS22= _mm_adds_epi16(m128iS22,_mm_unpacklo_epi8(E11l,m128Tmp0)); + m128iS23= _mm_adds_epi16(m128iS23,_mm_unpackhi_epi8(E11l,m128Tmp0)); + m128iS22= _mm_packus_epi16(m128iS22,m128iS23); + + m128iS24= _mm_adds_epi16(m128iS24,_mm_unpacklo_epi8(E12l,m128Tmp0)); + m128iS25= _mm_adds_epi16(m128iS25,_mm_unpackhi_epi8(E12l,m128Tmp0)); + m128iS24= _mm_packus_epi16(m128iS24,m128iS25); + + m128iS26= _mm_adds_epi16(m128iS26,_mm_unpacklo_epi8(E13l,m128Tmp0)); + m128iS27= _mm_adds_epi16(m128iS27,_mm_unpackhi_epi8(E13l,m128Tmp0)); + m128iS26= _mm_packus_epi16(m128iS26,m128iS27); + + m128iS28= _mm_adds_epi16(m128iS28,_mm_unpacklo_epi8(E14l,m128Tmp0)); + m128iS29= _mm_adds_epi16(m128iS29,_mm_unpackhi_epi8(E14l,m128Tmp0)); + m128iS28= _mm_packus_epi16(m128iS28,m128iS29); + + m128iS30= _mm_adds_epi16(m128iS30,_mm_unpacklo_epi8(E15l,m128Tmp0)); + m128iS31= _mm_adds_epi16(m128iS31,_mm_unpackhi_epi8(E15l,m128Tmp0)); + m128iS30= _mm_packus_epi16(m128iS30,m128iS31); + + + _mm_store_si128((__m128i*)dst,m128iS0); + _mm_store_si128((__m128i*)(dst+16),m128iS2); + _mm_store_si128((__m128i*)(dst+stride),m128iS4); + _mm_store_si128((__m128i*)(dst+stride+16),m128iS6); + _mm_store_si128((__m128i*)(dst+2*stride),m128iS8); + _mm_store_si128((__m128i*)(dst+2*stride+16),m128iS10); + _mm_store_si128((__m128i*)(dst+3*stride),m128iS12); + _mm_store_si128((__m128i*)(dst+3*stride+16),m128iS14); + _mm_store_si128((__m128i*)(dst+4*stride),m128iS16); + _mm_store_si128((__m128i*)(dst+4*stride+16),m128iS18); + _mm_store_si128((__m128i*)(dst+5*stride),m128iS20); + _mm_store_si128((__m128i*)(dst+5*stride+16),m128iS22); + _mm_store_si128((__m128i*)(dst+6*stride),m128iS24); + _mm_store_si128((__m128i*)(dst+6*stride+16),m128iS26); + _mm_store_si128((__m128i*)(dst+7*stride),m128iS28); + _mm_store_si128((__m128i*)(dst+7*stride+16),m128iS30); + + + if(i==0){ + //load next values : + m128iS0 = 
r1; + m128iS1 = r5; + m128iS2 = r9; + m128iS3 = r13; + m128iS4 = r17; + m128iS5 = r21; + m128iS6 = r25; + m128iS7 = r29; + m128iS8 = r33; + m128iS9 = r37; + m128iS10 = r41; + m128iS11 = r45; + m128iS12 = r49; + m128iS13 = r53; + m128iS14 = r57; + m128iS15 = r61; + m128iS16 = r65; + m128iS17 = r69; + m128iS18 = r73; + m128iS19 = r77; + m128iS20 = r81; + m128iS21 = r85; + m128iS22 = r89; + m128iS23 = r93; + m128iS24 = r97; + m128iS25 = r101; + m128iS26 = r105; + m128iS27 = r109; + m128iS28 = r113; + m128iS29 = r117; + m128iS30 = r121; + m128iS31 =r125; + + }else if(i ==8){ + //load next values : + m128iS0 = r2; + m128iS1 = r6; + m128iS2 = r10; + m128iS3 = r14; + m128iS4 = r18; + m128iS5 = r22; + m128iS6 = r26; + m128iS7 = r30; + m128iS8 = r34; + m128iS9 = r38; + m128iS10 = r42; + m128iS11 = r46; + m128iS12 = r50; + m128iS13 = r54; + m128iS14 = r58; + m128iS15 = r62; + m128iS16 = r66; + m128iS17 = r70; + m128iS18 = r74; + m128iS19 = r78; + m128iS20 = r82; + m128iS21 = r86; + m128iS22 = r90; + m128iS23 = r94; + m128iS24 = r98; + m128iS25 = r102; + m128iS26 = r106; + m128iS27 = r110; + m128iS28 = r114; + m128iS29 = r118; + m128iS30 = r122; + m128iS31 =r126; + + }else if(i==16) + { + //load next values : + m128iS0 = r3; + m128iS1 = r7; + m128iS2 = r11; + m128iS3 = r15; + m128iS4 = r19; + m128iS5 = r23; + m128iS6 = r27; + m128iS7 = r31; + m128iS8 = r35; + m128iS9 = r39; + m128iS10 = r43; + m128iS11 = r47; + m128iS12 = r51; + m128iS13 = r55; + m128iS14 = r59; + m128iS15 = r63; + m128iS16 = r67; + m128iS17 = r71; + m128iS18 = r75; + m128iS19 = r79; + m128iS20 = r83; + m128iS21 = r87; + m128iS22 = r91; + m128iS23 = r95; + m128iS24 = r99; + m128iS25 = r103; + m128iS26 = r107; + m128iS27 = r111; + m128iS28 = r115; + m128iS29 = r119; + m128iS30 = r123; + m128iS31 =r127; + } + } + } + } +} +#endif + + +#if 0 +void ff_hevc_transform_32x32_add_10_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + int i, j; + uint16_t *dst = (uint16_t*) _dst; + ptrdiff_t stride = 
_stride / 2; + int shift; + uint8_t shift_2nd = 10; //20 - bit depth + uint16_t add_2nd = 1<<9; //shift2 - 1 + int16_t *src = coeffs; + + __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, + m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13, + m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, + m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h, + E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h, + O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l, + E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h; + __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l; + __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h, + EEE0l, EEE1l, EEE0h, EEE1h; + __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21, + m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27, + m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9, + m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15, + O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l, + O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l, + EE4l, EE7h, EE6h, EE5h, EE4h; + m128iS0 = _mm_load_si128((__m128i *) (src)); + m128iS1 = _mm_load_si128((__m128i *) (src + 32)); + m128iS2 = _mm_load_si128((__m128i *) (src + 64)); + m128iS3 = _mm_load_si128((__m128i *) (src + 96)); + m128iS4 = _mm_loadu_si128((__m128i *) (src + 128)); + m128iS5 = _mm_load_si128((__m128i *) (src + 160)); + m128iS6 = _mm_load_si128((__m128i *) (src + 192)); + m128iS7 = _mm_load_si128((__m128i *) (src + 224)); + m128iS8 = _mm_load_si128((__m128i *) (src + 256)); + m128iS9 = _mm_load_si128((__m128i *) (src + 288)); + m128iS10 = _mm_load_si128((__m128i *) (src + 320)); + m128iS11 = _mm_load_si128((__m128i *) (src + 352)); + m128iS12 = _mm_loadu_si128((__m128i *) (src + 384)); + m128iS13 = _mm_load_si128((__m128i *) (src + 416)); + m128iS14 = _mm_load_si128((__m128i *) (src + 448)); + 
m128iS15 = _mm_load_si128((__m128i *) (src + 480)); + m128iS16 = _mm_load_si128((__m128i *) (src + 512)); + m128iS17 = _mm_load_si128((__m128i *) (src + 544)); + m128iS18 = _mm_load_si128((__m128i *) (src + 576)); + m128iS19 = _mm_load_si128((__m128i *) (src + 608)); + m128iS20 = _mm_load_si128((__m128i *) (src + 640)); + m128iS21 = _mm_load_si128((__m128i *) (src + 672)); + m128iS22 = _mm_load_si128((__m128i *) (src + 704)); + m128iS23 = _mm_load_si128((__m128i *) (src + 736)); + m128iS24 = _mm_load_si128((__m128i *) (src + 768)); + m128iS25 = _mm_load_si128((__m128i *) (src + 800)); + m128iS26 = _mm_load_si128((__m128i *) (src + 832)); + m128iS27 = _mm_load_si128((__m128i *) (src + 864)); + m128iS28 = _mm_load_si128((__m128i *) (src + 896)); + m128iS29 = _mm_load_si128((__m128i *) (src + 928)); + m128iS30 = _mm_load_si128((__m128i *) (src + 960)); + m128iS31 = _mm_load_si128((__m128i *) (src + 992)); + + shift = shift_1st; + m128iAdd = _mm_set1_epi32(add_1st); + + for (j = 0; j < 2; j++) { + for (i = 0; i < 32; i += 8) { + m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][0]))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][0]))); + + m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][0]))); + m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][0]))); + + m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) 
(transform32x32[3][0]))); + m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][0]))); + + m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19); + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][0]))); + m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][0]))); + + m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][0]))); + m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][0]))); + + m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][0]))); + m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][0]))); + + m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][0]))); + m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][0]))); + + O0l = _mm_add_epi32(E0l, E1l); + O0l = _mm_add_epi32(O0l, E2l); + O0l = _mm_add_epi32(O0l, E3l); + O0l = _mm_add_epi32(O0l, E4l); + O0l = _mm_add_epi32(O0l, E5l); + O0l = _mm_add_epi32(O0l, E6l); + O0l = _mm_add_epi32(O0l, E7l); + + O0h = _mm_add_epi32(E0h, E1h); + O0h = _mm_add_epi32(O0h, E2h); + O0h = _mm_add_epi32(O0h, E3h); + O0h = _mm_add_epi32(O0h, E4h); + O0h = _mm_add_epi32(O0h, E5h); + O0h = _mm_add_epi32(O0h, E6h); + O0h = _mm_add_epi32(O0h, E7h); + + /* Compute O1*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][1]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][1]))); + E1l = 
_mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][1]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][1]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][1]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][1]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][1]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][1]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][1]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][1]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][1]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][1]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][1]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][1]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][1]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][1]))); + + O1l = _mm_add_epi32(E0l, E1l); + O1l = _mm_add_epi32(O1l, E2l); + O1l = _mm_add_epi32(O1l, E3l); + O1l = _mm_add_epi32(O1l, E4l); + O1l = _mm_add_epi32(O1l, E5l); + O1l = _mm_add_epi32(O1l, E6l); + O1l = _mm_add_epi32(O1l, E7l); + + O1h = _mm_add_epi32(E0h, E1h); + O1h = _mm_add_epi32(O1h, E2h); + O1h = _mm_add_epi32(O1h, E3h); + O1h = _mm_add_epi32(O1h, E4h); + O1h = _mm_add_epi32(O1h, E5h); + O1h = _mm_add_epi32(O1h, E6h); + O1h = _mm_add_epi32(O1h, E7h); + /* Compute O2*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][2]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][2]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][2]))); + E1h = 
_mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][2]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][2]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][2]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][2]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][2]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][2]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][2]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][2]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][2]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][2]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][2]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][2]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][2]))); + + O2l = _mm_add_epi32(E0l, E1l); + O2l = _mm_add_epi32(O2l, E2l); + O2l = _mm_add_epi32(O2l, E3l); + O2l = _mm_add_epi32(O2l, E4l); + O2l = _mm_add_epi32(O2l, E5l); + O2l = _mm_add_epi32(O2l, E6l); + O2l = _mm_add_epi32(O2l, E7l); + + O2h = _mm_add_epi32(E0h, E1h); + O2h = _mm_add_epi32(O2h, E2h); + O2h = _mm_add_epi32(O2h, E3h); + O2h = _mm_add_epi32(O2h, E4h); + O2h = _mm_add_epi32(O2h, E5h); + O2h = _mm_add_epi32(O2h, E6h); + O2h = _mm_add_epi32(O2h, E7h); + /* Compute O3*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][3]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][3]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][3]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][3]))); + E2l = 
_mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][3]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][3]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][3]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][3]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][3]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][3]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][3]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][3]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][3]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][3]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][3]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][3]))); + + O3l = _mm_add_epi32(E0l, E1l); + O3l = _mm_add_epi32(O3l, E2l); + O3l = _mm_add_epi32(O3l, E3l); + O3l = _mm_add_epi32(O3l, E4l); + O3l = _mm_add_epi32(O3l, E5l); + O3l = _mm_add_epi32(O3l, E6l); + O3l = _mm_add_epi32(O3l, E7l); + + O3h = _mm_add_epi32(E0h, E1h); + O3h = _mm_add_epi32(O3h, E2h); + O3h = _mm_add_epi32(O3h, E3h); + O3h = _mm_add_epi32(O3h, E4h); + O3h = _mm_add_epi32(O3h, E5h); + O3h = _mm_add_epi32(O3h, E6h); + O3h = _mm_add_epi32(O3h, E7h); + /* Compute O4*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][4]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][4]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][4]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][4]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][4]))); + E2h = 
_mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][4]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][4]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][4]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][4]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][4]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][4]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][4]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][4]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][4]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][4]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][4]))); + + O4l = _mm_add_epi32(E0l, E1l); + O4l = _mm_add_epi32(O4l, E2l); + O4l = _mm_add_epi32(O4l, E3l); + O4l = _mm_add_epi32(O4l, E4l); + O4l = _mm_add_epi32(O4l, E5l); + O4l = _mm_add_epi32(O4l, E6l); + O4l = _mm_add_epi32(O4l, E7l); + + O4h = _mm_add_epi32(E0h, E1h); + O4h = _mm_add_epi32(O4h, E2h); + O4h = _mm_add_epi32(O4h, E3h); + O4h = _mm_add_epi32(O4h, E4h); + O4h = _mm_add_epi32(O4h, E5h); + O4h = _mm_add_epi32(O4h, E6h); + O4h = _mm_add_epi32(O4h, E7h); + + /* Compute O5*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][5]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][5]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][5]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][5]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][5]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][5]))); + E3l = 
_mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][5]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][5]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][5]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][5]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][5]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][5]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][5]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][5]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][5]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][5]))); + + O5l = _mm_add_epi32(E0l, E1l); + O5l = _mm_add_epi32(O5l, E2l); + O5l = _mm_add_epi32(O5l, E3l); + O5l = _mm_add_epi32(O5l, E4l); + O5l = _mm_add_epi32(O5l, E5l); + O5l = _mm_add_epi32(O5l, E6l); + O5l = _mm_add_epi32(O5l, E7l); + + O5h = _mm_add_epi32(E0h, E1h); + O5h = _mm_add_epi32(O5h, E2h); + O5h = _mm_add_epi32(O5h, E3h); + O5h = _mm_add_epi32(O5h, E4h); + O5h = _mm_add_epi32(O5h, E5h); + O5h = _mm_add_epi32(O5h, E6h); + O5h = _mm_add_epi32(O5h, E7h); + + /* Compute O6*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][6]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][6]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][6]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][6]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][6]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][6]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][6]))); + E3h = 
_mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][6]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][6]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][6]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][6]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][6]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][6]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][6]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][6]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][6]))); + + O6l = _mm_add_epi32(E0l, E1l); + O6l = _mm_add_epi32(O6l, E2l); + O6l = _mm_add_epi32(O6l, E3l); + O6l = _mm_add_epi32(O6l, E4l); + O6l = _mm_add_epi32(O6l, E5l); + O6l = _mm_add_epi32(O6l, E6l); + O6l = _mm_add_epi32(O6l, E7l); + + O6h = _mm_add_epi32(E0h, E1h); + O6h = _mm_add_epi32(O6h, E2h); + O6h = _mm_add_epi32(O6h, E3h); + O6h = _mm_add_epi32(O6h, E4h); + O6h = _mm_add_epi32(O6h, E5h); + O6h = _mm_add_epi32(O6h, E6h); + O6h = _mm_add_epi32(O6h, E7h); + + /* Compute O7*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][7]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][7]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][7]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][7]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][7]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][7]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][7]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][7]))); + + E4l = 
_mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][7]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][7]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][7]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][7]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][7]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][7]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][7]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][7]))); + + O7l = _mm_add_epi32(E0l, E1l); + O7l = _mm_add_epi32(O7l, E2l); + O7l = _mm_add_epi32(O7l, E3l); + O7l = _mm_add_epi32(O7l, E4l); + O7l = _mm_add_epi32(O7l, E5l); + O7l = _mm_add_epi32(O7l, E6l); + O7l = _mm_add_epi32(O7l, E7l); + + O7h = _mm_add_epi32(E0h, E1h); + O7h = _mm_add_epi32(O7h, E2h); + O7h = _mm_add_epi32(O7h, E3h); + O7h = _mm_add_epi32(O7h, E4h); + O7h = _mm_add_epi32(O7h, E5h); + O7h = _mm_add_epi32(O7h, E6h); + O7h = _mm_add_epi32(O7h, E7h); + + /* Compute O8*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][8]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][8]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][8]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][8]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][8]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][8]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][8]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][8]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][8]))); + E4h = 
_mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][8]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][8]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][8]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][8]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][8]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][8]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][8]))); + + O8l = _mm_add_epi32(E0l, E1l); + O8l = _mm_add_epi32(O8l, E2l); + O8l = _mm_add_epi32(O8l, E3l); + O8l = _mm_add_epi32(O8l, E4l); + O8l = _mm_add_epi32(O8l, E5l); + O8l = _mm_add_epi32(O8l, E6l); + O8l = _mm_add_epi32(O8l, E7l); + + O8h = _mm_add_epi32(E0h, E1h); + O8h = _mm_add_epi32(O8h, E2h); + O8h = _mm_add_epi32(O8h, E3h); + O8h = _mm_add_epi32(O8h, E4h); + O8h = _mm_add_epi32(O8h, E5h); + O8h = _mm_add_epi32(O8h, E6h); + O8h = _mm_add_epi32(O8h, E7h); + + /* Compute O9*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][9]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][9]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][9]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][9]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][9]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][9]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][9]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][9]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][9]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][9]))); + E5l = 
_mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][9]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][9]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][9]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][9]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][9]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][9]))); + + O9l = _mm_add_epi32(E0l, E1l); + O9l = _mm_add_epi32(O9l, E2l); + O9l = _mm_add_epi32(O9l, E3l); + O9l = _mm_add_epi32(O9l, E4l); + O9l = _mm_add_epi32(O9l, E5l); + O9l = _mm_add_epi32(O9l, E6l); + O9l = _mm_add_epi32(O9l, E7l); + + O9h = _mm_add_epi32(E0h, E1h); + O9h = _mm_add_epi32(O9h, E2h); + O9h = _mm_add_epi32(O9h, E3h); + O9h = _mm_add_epi32(O9h, E4h); + O9h = _mm_add_epi32(O9h, E5h); + O9h = _mm_add_epi32(O9h, E6h); + O9h = _mm_add_epi32(O9h, E7h); + + /* Compute 10*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][10]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][10]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][10]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][10]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][10]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][10]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][10]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][10]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][10]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][10]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][10]))); + 
E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][10]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][10]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][10]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][10]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][10]))); + + O10l = _mm_add_epi32(E0l, E1l); + O10l = _mm_add_epi32(O10l, E2l); + O10l = _mm_add_epi32(O10l, E3l); + O10l = _mm_add_epi32(O10l, E4l); + O10l = _mm_add_epi32(O10l, E5l); + O10l = _mm_add_epi32(O10l, E6l); + O10l = _mm_add_epi32(O10l, E7l); + + O10h = _mm_add_epi32(E0h, E1h); + O10h = _mm_add_epi32(O10h, E2h); + O10h = _mm_add_epi32(O10h, E3h); + O10h = _mm_add_epi32(O10h, E4h); + O10h = _mm_add_epi32(O10h, E5h); + O10h = _mm_add_epi32(O10h, E6h); + O10h = _mm_add_epi32(O10h, E7h); + + /* Compute 11*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][11]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][11]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][11]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][11]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][11]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][11]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][11]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][11]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][11]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][11]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][11]))); + E5h = _mm_madd_epi16(m128Tmp11, + 
_mm_load_si128((__m128i *) (transform32x32[5][11]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][11]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][11]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][11]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][11]))); + + O11l = _mm_add_epi32(E0l, E1l); + O11l = _mm_add_epi32(O11l, E2l); + O11l = _mm_add_epi32(O11l, E3l); + O11l = _mm_add_epi32(O11l, E4l); + O11l = _mm_add_epi32(O11l, E5l); + O11l = _mm_add_epi32(O11l, E6l); + O11l = _mm_add_epi32(O11l, E7l); + + O11h = _mm_add_epi32(E0h, E1h); + O11h = _mm_add_epi32(O11h, E2h); + O11h = _mm_add_epi32(O11h, E3h); + O11h = _mm_add_epi32(O11h, E4h); + O11h = _mm_add_epi32(O11h, E5h); + O11h = _mm_add_epi32(O11h, E6h); + O11h = _mm_add_epi32(O11h, E7h); + + /* Compute 12*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][12]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][12]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][12]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][12]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][12]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][12]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][12]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][12]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][12]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][12]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][12]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][12]))); 
+ E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][12]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][12]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][12]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][12]))); + + O12l = _mm_add_epi32(E0l, E1l); + O12l = _mm_add_epi32(O12l, E2l); + O12l = _mm_add_epi32(O12l, E3l); + O12l = _mm_add_epi32(O12l, E4l); + O12l = _mm_add_epi32(O12l, E5l); + O12l = _mm_add_epi32(O12l, E6l); + O12l = _mm_add_epi32(O12l, E7l); + + O12h = _mm_add_epi32(E0h, E1h); + O12h = _mm_add_epi32(O12h, E2h); + O12h = _mm_add_epi32(O12h, E3h); + O12h = _mm_add_epi32(O12h, E4h); + O12h = _mm_add_epi32(O12h, E5h); + O12h = _mm_add_epi32(O12h, E6h); + O12h = _mm_add_epi32(O12h, E7h); + + /* Compute 13*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][13]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][13]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][13]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][13]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][13]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][13]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][13]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][13]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][13]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][13]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][13]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][13]))); + E6l = _mm_madd_epi16(m128Tmp12, + 
_mm_load_si128((__m128i *) (transform32x32[6][13]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][13]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][13]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][13]))); + + O13l = _mm_add_epi32(E0l, E1l); + O13l = _mm_add_epi32(O13l, E2l); + O13l = _mm_add_epi32(O13l, E3l); + O13l = _mm_add_epi32(O13l, E4l); + O13l = _mm_add_epi32(O13l, E5l); + O13l = _mm_add_epi32(O13l, E6l); + O13l = _mm_add_epi32(O13l, E7l); + + O13h = _mm_add_epi32(E0h, E1h); + O13h = _mm_add_epi32(O13h, E2h); + O13h = _mm_add_epi32(O13h, E3h); + O13h = _mm_add_epi32(O13h, E4h); + O13h = _mm_add_epi32(O13h, E5h); + O13h = _mm_add_epi32(O13h, E6h); + O13h = _mm_add_epi32(O13h, E7h); + + /* Compute O14 */ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][14]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][14]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][14]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][14]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][14]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][14]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][14]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][14]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][14]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][14]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][14]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][14]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) 
(transform32x32[6][14]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][14]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][14]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][14]))); + + O14l = _mm_add_epi32(E0l, E1l); + O14l = _mm_add_epi32(O14l, E2l); + O14l = _mm_add_epi32(O14l, E3l); + O14l = _mm_add_epi32(O14l, E4l); + O14l = _mm_add_epi32(O14l, E5l); + O14l = _mm_add_epi32(O14l, E6l); + O14l = _mm_add_epi32(O14l, E7l); + + O14h = _mm_add_epi32(E0h, E1h); + O14h = _mm_add_epi32(O14h, E2h); + O14h = _mm_add_epi32(O14h, E3h); + O14h = _mm_add_epi32(O14h, E4h); + O14h = _mm_add_epi32(O14h, E5h); + O14h = _mm_add_epi32(O14h, E6h); + O14h = _mm_add_epi32(O14h, E7h); + + /* Compute O15*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][15]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][15]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][15]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][15]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][15]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][15]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][15]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][15]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][15]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][15]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][15]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][15]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][15]))); + E6h = 
_mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][15]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][15]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][15]))); + + O15l = _mm_add_epi32(E0l, E1l); + O15l = _mm_add_epi32(O15l, E2l); + O15l = _mm_add_epi32(O15l, E3l); + O15l = _mm_add_epi32(O15l, E4l); + O15l = _mm_add_epi32(O15l, E5l); + O15l = _mm_add_epi32(O15l, E6l); + O15l = _mm_add_epi32(O15l, E7l); + + O15h = _mm_add_epi32(E0h, E1h); + O15h = _mm_add_epi32(O15h, E2h); + O15h = _mm_add_epi32(O15h, E3h); + O15h = _mm_add_epi32(O15h, E4h); + O15h = _mm_add_epi32(O15h, E5h); + O15h = _mm_add_epi32(O15h, E6h); + O15h = _mm_add_epi32(O15h, E7h); + /* Compute E0 */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14); + E0l = _mm_add_epi32(E0l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][0])))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14); + E0h = _mm_add_epi32(E0h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][0])))); + + m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22); + E0l = _mm_add_epi32(E0l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][0])))); + m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22); + E0h = _mm_add_epi32(E0h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][0])))); + + m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30); + E0l = _mm_add_epi32(E0l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][0])))); + m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30); + E0h = _mm_add_epi32(E0h, + 
_mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][0])))); + + /* Compute E1 */ + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); + E1l = _mm_add_epi32(E1l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][1])))); + E1h = _mm_add_epi32(E1h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][1])))); + E1l = _mm_add_epi32(E1l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][1])))); + E1h = _mm_add_epi32(E1h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][1])))); + E1l = _mm_add_epi32(E1l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][1])))); + E1h = _mm_add_epi32(E1h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][1])))); + + /* Compute E2 */ + E2l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); + E2h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); + E2l = _mm_add_epi32(E2l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][2])))); + E2h = _mm_add_epi32(E2h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][2])))); + E2l = _mm_add_epi32(E2l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][2])))); + E2h = _mm_add_epi32(E2h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][2])))); + E2l = _mm_add_epi32(E2l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][2])))); + E2h = _mm_add_epi32(E2h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][2])))); + + /* Compute E3 */ + E3l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); + E3h = 
_mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); + E3l = _mm_add_epi32(E3l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][3])))); + E3h = _mm_add_epi32(E3h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][3])))); + E3l = _mm_add_epi32(E3l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][3])))); + E3h = _mm_add_epi32(E3h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][3])))); + E3l = _mm_add_epi32(E3l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][3])))); + E3h = _mm_add_epi32(E3h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][3])))); + + /* Compute E4 */ + E4l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); + E4h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); + E4l = _mm_add_epi32(E4l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][4])))); + E4h = _mm_add_epi32(E4h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][4])))); + E4l = _mm_add_epi32(E4l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][4])))); + E4h = _mm_add_epi32(E4h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][4])))); + E4l = _mm_add_epi32(E4l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][4])))); + E4h = _mm_add_epi32(E4h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][4])))); + + /* Compute E5 */ + E5l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); + E5h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); + E5l = _mm_add_epi32(E5l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][5])))); + E5h =
_mm_add_epi32(E5h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][5])))); + E5l = _mm_add_epi32(E5l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][5])))); + E5h = _mm_add_epi32(E5h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][5])))); + E5l = _mm_add_epi32(E5l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][5])))); + E5h = _mm_add_epi32(E5h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][5])))); + + /* Compute E6 */ + E6l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); + E6h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); + E6l = _mm_add_epi32(E6l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][6])))); + E6h = _mm_add_epi32(E6h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][6])))); + E6l = _mm_add_epi32(E6l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][6])))); + E6h = _mm_add_epi32(E6h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][6])))); + E6l = _mm_add_epi32(E6l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][6])))); + E6h = _mm_add_epi32(E6h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][6])))); + + /* Compute E7 */ + E7l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); + E7h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); + E7l = _mm_add_epi32(E7l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][7])))); + E7h = _mm_add_epi32(E7h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][7])))); + E7l = _mm_add_epi32(E7l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) 
(transform16x16_1[2][7])))); + E7h = _mm_add_epi32(E7h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][7])))); + E7l = _mm_add_epi32(E7l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][7])))); + E7h = _mm_add_epi32(E7h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][7])))); + + /* Compute EE0 and EEE */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12); + E00l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12); + E00h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28); + E00l = _mm_add_epi32(E00l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][0])))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28); + E00h = _mm_add_epi32(E00h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][0])))); + + E01l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); + E01h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); + E01l = _mm_add_epi32(E01l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][1])))); + E01h = _mm_add_epi32(E01h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][1])))); + + E02l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); + E02h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); + E02l = _mm_add_epi32(E02l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][2])))); + E02h = _mm_add_epi32(E02h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][2])))); + + E03l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); + E03h = _mm_madd_epi16(m128Tmp1, + 
_mm_load_si128((__m128i *) (transform16x16_2[0][3]))); + E03l = _mm_add_epi32(E03l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][3])))); + E03h = _mm_add_epi32(E03h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][3])))); + + /* Compute EE0 and EEE */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24); + EE0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24); + EE0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16); + EEE0l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16); + EEE0h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); + + EE1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); + EE1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); + + EEE1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); + EEE1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); + + /* Compute EE */ + + EE2l = _mm_sub_epi32(EEE1l, EE1l); + EE3l = _mm_sub_epi32(EEE0l, EE0l); + EE2h = _mm_sub_epi32(EEE1h, EE1h); + EE3h = _mm_sub_epi32(EEE0h, EE0h); + + EE0l = _mm_add_epi32(EEE0l, EE0l); + EE1l = _mm_add_epi32(EEE1l, EE1l); + EE0h = _mm_add_epi32(EEE0h, EE0h); + EE1h = _mm_add_epi32(EEE1h, EE1h); + /**/ + + EE7l = _mm_sub_epi32(EE0l, E00l); + EE6l = _mm_sub_epi32(EE1l, E01l); + EE5l = _mm_sub_epi32(EE2l, E02l); + EE4l = _mm_sub_epi32(EE3l, E03l); + + EE7h = _mm_sub_epi32(EE0h, E00h); + EE6h = _mm_sub_epi32(EE1h, E01h); + EE5h = _mm_sub_epi32(EE2h, E02h); + EE4h = _mm_sub_epi32(EE3h, E03h); + + EE0l = _mm_add_epi32(EE0l, E00l); + EE1l = _mm_add_epi32(EE1l, E01l); + EE2l = _mm_add_epi32(EE2l, E02l); + 
EE3l = _mm_add_epi32(EE3l, E03l); + + EE0h = _mm_add_epi32(EE0h, E00h); + EE1h = _mm_add_epi32(EE1h, E01h); + EE2h = _mm_add_epi32(EE2h, E02h); + EE3h = _mm_add_epi32(EE3h, E03h); + /* Compute E */ + + E15l = _mm_sub_epi32(EE0l, E0l); + E15l = _mm_add_epi32(E15l, m128iAdd); + E14l = _mm_sub_epi32(EE1l, E1l); + E14l = _mm_add_epi32(E14l, m128iAdd); + E13l = _mm_sub_epi32(EE2l, E2l); + E13l = _mm_add_epi32(E13l, m128iAdd); + E12l = _mm_sub_epi32(EE3l, E3l); + E12l = _mm_add_epi32(E12l, m128iAdd); + E11l = _mm_sub_epi32(EE4l, E4l); + E11l = _mm_add_epi32(E11l, m128iAdd); + E10l = _mm_sub_epi32(EE5l, E5l); + E10l = _mm_add_epi32(E10l, m128iAdd); + E9l = _mm_sub_epi32(EE6l, E6l); + E9l = _mm_add_epi32(E9l, m128iAdd); + E8l = _mm_sub_epi32(EE7l, E7l); + E8l = _mm_add_epi32(E8l, m128iAdd); + + E0l = _mm_add_epi32(EE0l, E0l); + E0l = _mm_add_epi32(E0l, m128iAdd); + E1l = _mm_add_epi32(EE1l, E1l); + E1l = _mm_add_epi32(E1l, m128iAdd); + E2l = _mm_add_epi32(EE2l, E2l); + E2l = _mm_add_epi32(E2l, m128iAdd); + E3l = _mm_add_epi32(EE3l, E3l); + E3l = _mm_add_epi32(E3l, m128iAdd); + E4l = _mm_add_epi32(EE4l, E4l); + E4l = _mm_add_epi32(E4l, m128iAdd); + E5l = _mm_add_epi32(EE5l, E5l); + E5l = _mm_add_epi32(E5l, m128iAdd); + E6l = _mm_add_epi32(EE6l, E6l); + E6l = _mm_add_epi32(E6l, m128iAdd); + E7l = _mm_add_epi32(EE7l, E7l); + E7l = _mm_add_epi32(E7l, m128iAdd); + + E15h = _mm_sub_epi32(EE0h, E0h); + E15h = _mm_add_epi32(E15h, m128iAdd); + E14h = _mm_sub_epi32(EE1h, E1h); + E14h = _mm_add_epi32(E14h, m128iAdd); + E13h = _mm_sub_epi32(EE2h, E2h); + E13h = _mm_add_epi32(E13h, m128iAdd); + E12h = _mm_sub_epi32(EE3h, E3h); + E12h = _mm_add_epi32(E12h, m128iAdd); + E11h = _mm_sub_epi32(EE4h, E4h); + E11h = _mm_add_epi32(E11h, m128iAdd); + E10h = _mm_sub_epi32(EE5h, E5h); + E10h = _mm_add_epi32(E10h, m128iAdd); + E9h = _mm_sub_epi32(EE6h, E6h); + E9h = _mm_add_epi32(E9h, m128iAdd); + E8h = _mm_sub_epi32(EE7h, E7h); + E8h = _mm_add_epi32(E8h, m128iAdd); + + E0h = _mm_add_epi32(EE0h, 
E0h); + E0h = _mm_add_epi32(E0h, m128iAdd); + E1h = _mm_add_epi32(EE1h, E1h); + E1h = _mm_add_epi32(E1h, m128iAdd); + E2h = _mm_add_epi32(EE2h, E2h); + E2h = _mm_add_epi32(E2h, m128iAdd); + E3h = _mm_add_epi32(EE3h, E3h); + E3h = _mm_add_epi32(E3h, m128iAdd); + E4h = _mm_add_epi32(EE4h, E4h); + E4h = _mm_add_epi32(E4h, m128iAdd); + E5h = _mm_add_epi32(EE5h, E5h); + E5h = _mm_add_epi32(E5h, m128iAdd); + E6h = _mm_add_epi32(EE6h, E6h); + E6h = _mm_add_epi32(E6h, m128iAdd); + E7h = _mm_add_epi32(EE7h, E7h); + E7h = _mm_add_epi32(E7h, m128iAdd); + + m128iS0 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift), + _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift)); + m128iS1 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift), + _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift)); + m128iS2 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift), + _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift)); + m128iS3 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift), + _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift)); + m128iS4 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift), + _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift)); + m128iS5 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift), + _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift)); + m128iS6 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift), + _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift)); + m128iS7 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift), + _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift)); + m128iS8 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift), + _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift)); + m128iS9 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift), + _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift)); + m128iS10 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift), + _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift)); + m128iS11 = 
_mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift), + _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift)); + m128iS12 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift), + _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift)); + m128iS13 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift), + _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift)); + m128iS14 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift), + _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift)); + m128iS15 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift), + _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift)); + + m128iS31 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift), + _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift)); + m128iS30 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift), + _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift)); + m128iS29 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift), + _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift)); + m128iS28 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift), + _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift)); + m128iS27 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift), + _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift)); + m128iS26 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift), + _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift)); + m128iS25 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift), + _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift)); + m128iS24 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift), + _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift)); + m128iS23 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift), + _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift)); + m128iS22 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift), + _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift)); + m128iS21 = 
_mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift), + _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift)); + m128iS20 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift), + _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift)); + m128iS19 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift), + _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift)); + m128iS18 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift), + _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift)); + m128iS17 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift), + _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift)); + m128iS16 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift), + _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift)); + + if (!j) { + /* Inverse the matrix */ + E0l = _mm_unpacklo_epi16(m128iS0, m128iS16); + E1l = _mm_unpacklo_epi16(m128iS1, m128iS17); + E2l = _mm_unpacklo_epi16(m128iS2, m128iS18); + E3l = _mm_unpacklo_epi16(m128iS3, m128iS19); + E4l = _mm_unpacklo_epi16(m128iS4, m128iS20); + E5l = _mm_unpacklo_epi16(m128iS5, m128iS21); + E6l = _mm_unpacklo_epi16(m128iS6, m128iS22); + E7l = _mm_unpacklo_epi16(m128iS7, m128iS23); + E8l = _mm_unpacklo_epi16(m128iS8, m128iS24); + E9l = _mm_unpacklo_epi16(m128iS9, m128iS25); + E10l = _mm_unpacklo_epi16(m128iS10, m128iS26); + E11l = _mm_unpacklo_epi16(m128iS11, m128iS27); + E12l = _mm_unpacklo_epi16(m128iS12, m128iS28); + E13l = _mm_unpacklo_epi16(m128iS13, m128iS29); + E14l = _mm_unpacklo_epi16(m128iS14, m128iS30); + E15l = _mm_unpacklo_epi16(m128iS15, m128iS31); + + O0l = _mm_unpackhi_epi16(m128iS0, m128iS16); + O1l = _mm_unpackhi_epi16(m128iS1, m128iS17); + O2l = _mm_unpackhi_epi16(m128iS2, m128iS18); + O3l = _mm_unpackhi_epi16(m128iS3, m128iS19); + O4l = _mm_unpackhi_epi16(m128iS4, m128iS20); + O5l = _mm_unpackhi_epi16(m128iS5, m128iS21); + O6l = _mm_unpackhi_epi16(m128iS6, m128iS22); + O7l = _mm_unpackhi_epi16(m128iS7, m128iS23); + O8l = 
_mm_unpackhi_epi16(m128iS8, m128iS24); + O9l = _mm_unpackhi_epi16(m128iS9, m128iS25); + O10l = _mm_unpackhi_epi16(m128iS10, m128iS26); + O11l = _mm_unpackhi_epi16(m128iS11, m128iS27); + O12l = _mm_unpackhi_epi16(m128iS12, m128iS28); + O13l = _mm_unpackhi_epi16(m128iS13, m128iS29); + O14l = _mm_unpackhi_epi16(m128iS14, m128iS30); + O15l = _mm_unpackhi_epi16(m128iS15, m128iS31); + + E0h = _mm_unpacklo_epi16(E0l, E8l); + E1h = _mm_unpacklo_epi16(E1l, E9l); + E2h = _mm_unpacklo_epi16(E2l, E10l); + E3h = _mm_unpacklo_epi16(E3l, E11l); + E4h = _mm_unpacklo_epi16(E4l, E12l); + E5h = _mm_unpacklo_epi16(E5l, E13l); + E6h = _mm_unpacklo_epi16(E6l, E14l); + E7h = _mm_unpacklo_epi16(E7l, E15l); + + E8h = _mm_unpackhi_epi16(E0l, E8l); + E9h = _mm_unpackhi_epi16(E1l, E9l); + E10h = _mm_unpackhi_epi16(E2l, E10l); + E11h = _mm_unpackhi_epi16(E3l, E11l); + E12h = _mm_unpackhi_epi16(E4l, E12l); + E13h = _mm_unpackhi_epi16(E5l, E13l); + E14h = _mm_unpackhi_epi16(E6l, E14l); + E15h = _mm_unpackhi_epi16(E7l, E15l); + + m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); + m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); + m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); + m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); + m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); + m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); + m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + 
m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h); + m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h); + m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h); + m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h); + m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h); + m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h); + m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + /* */ + E0h = _mm_unpacklo_epi16(O0l, O8l); + E1h = _mm_unpacklo_epi16(O1l, O9l); + E2h = _mm_unpacklo_epi16(O2l, O10l); + E3h = _mm_unpacklo_epi16(O3l, O11l); + E4h = _mm_unpacklo_epi16(O4l, O12l); + E5h = _mm_unpacklo_epi16(O5l, O13l); + E6h = _mm_unpacklo_epi16(O6l, O14l); + E7h = _mm_unpacklo_epi16(O7l, O15l); + + E8h = _mm_unpackhi_epi16(O0l, O8l); + E9h = _mm_unpackhi_epi16(O1l, O9l); + E10h = _mm_unpackhi_epi16(O2l, O10l); + E11h = _mm_unpackhi_epi16(O3l, O11l); + E12h = _mm_unpackhi_epi16(O4l, O12l); + E13h = _mm_unpackhi_epi16(O5l, O13l); + E14h = 
_mm_unpackhi_epi16(O6l, O14l); + E15h = _mm_unpackhi_epi16(O7l, O15l); + + m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); + m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); + m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); + m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); + m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); + m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); + m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h); + m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h); + m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h); + m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h); + m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h); + m128Tmp2 = 
_mm_unpackhi_epi16(E10h, E14h); + m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + /* */ + _mm_store_si128((__m128i *) (src + i), m128iS0); + _mm_store_si128((__m128i *) (src + 32 + i), m128iS1); + _mm_store_si128((__m128i *) (src + 64 + i), m128iS2); + _mm_store_si128((__m128i *) (src + 96 + i), m128iS3); + _mm_store_si128((__m128i *) (src + 128 + i), m128iS4); + _mm_store_si128((__m128i *) (src + 160 + i), m128iS5); + _mm_store_si128((__m128i *) (src + 192 + i), m128iS6); + _mm_store_si128((__m128i *) (src + 224 + i), m128iS7); + _mm_store_si128((__m128i *) (src + 256 + i), m128iS8); + _mm_store_si128((__m128i *) (src + 288 + i), m128iS9); + _mm_store_si128((__m128i *) (src + 320 + i), m128iS10); + _mm_store_si128((__m128i *) (src + 352 + i), m128iS11); + _mm_store_si128((__m128i *) (src + 384 + i), m128iS12); + _mm_store_si128((__m128i *) (src + 416 + i), m128iS13); + _mm_store_si128((__m128i *) (src + 448 + i), m128iS14); + _mm_store_si128((__m128i *) (src + 480 + i), m128iS15); + _mm_store_si128((__m128i *) (src + 512 + i), m128iS16); + _mm_store_si128((__m128i *) (src + 544 + i), m128iS17); + _mm_store_si128((__m128i *) (src + 576 + i), m128iS18); + _mm_store_si128((__m128i *) (src + 608 + i), m128iS19); + _mm_store_si128((__m128i *) (src + 640 + i), m128iS20); + _mm_store_si128((__m128i *) (src + 672 + i), m128iS21); + _mm_store_si128((__m128i *) (src + 704 + i), m128iS22); + _mm_store_si128((__m128i *) (src + 736 + i), m128iS23); + _mm_store_si128((__m128i *) (src + 768 + i), m128iS24); + _mm_store_si128((__m128i *) (src + 800 + i), 
m128iS25); + _mm_store_si128((__m128i *) (src + 832 + i), m128iS26); + _mm_store_si128((__m128i *) (src + 864 + i), m128iS27); + _mm_store_si128((__m128i *) (src + 896 + i), m128iS28); + _mm_store_si128((__m128i *) (src + 928 + i), m128iS29); + _mm_store_si128((__m128i *) (src + 960 + i), m128iS30); + _mm_store_si128((__m128i *) (src + 992 + i), m128iS31); + + if (i <= 16) { + int k = i + 8; + m128iS0 = _mm_load_si128((__m128i *) (src + k)); + m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k)); + m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k)); + m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k)); + m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k)); + m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k)); + m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k)); + m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k)); + m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k)); + m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k)); + m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k)); + m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k)); + m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k)); + m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k)); + m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k)); + m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k)); + + m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k)); + m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k)); + m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k)); + m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k)); + m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k)); + m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k)); + m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k)); + m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k)); + m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k)); + m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k)); + m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k)); + m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 
k)); + m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k)); + m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k)); + m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k)); + m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k)); + } else { + m128iS0 = _mm_load_si128((__m128i *) (src)); + m128iS1 = _mm_load_si128((__m128i *) (src + 128)); + m128iS2 = _mm_load_si128((__m128i *) (src + 256)); + m128iS3 = _mm_load_si128((__m128i *) (src + 384)); + m128iS4 = _mm_loadu_si128((__m128i *) (src + 512)); + m128iS5 = _mm_load_si128((__m128i *) (src + 640)); + m128iS6 = _mm_load_si128((__m128i *) (src + 768)); + m128iS7 = _mm_load_si128((__m128i *) (src + 896)); + m128iS8 = _mm_load_si128((__m128i *) (src + 8)); + m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8)); + m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8)); + m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8)); + m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8)); + m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8)); + m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8)); + m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8)); + m128iS16 = _mm_load_si128((__m128i *) (src + 16)); + m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16)); + m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16)); + m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16)); + m128iS20 = _mm_loadu_si128((__m128i *) (src + 512 + 16)); + m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16)); + m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16)); + m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16)); + m128iS24 = _mm_load_si128((__m128i *) (src + 24)); + m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24)); + m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24)); + m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24)); + m128iS28 = _mm_loadu_si128((__m128i *) (src + 512 + 24)); + m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24)); + m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24)); 
+ m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24)); + shift = shift_2nd; + m128iAdd = _mm_set1_epi32(add_2nd); + } + + } else { + int k, m = 0; + _mm_storeu_si128((__m128i *) (src), m128iS0); + _mm_storeu_si128((__m128i *) (src + 8), m128iS1); + _mm_storeu_si128((__m128i *) (src + 16), m128iS2); + _mm_storeu_si128((__m128i *) (src + 24), m128iS3); + _mm_storeu_si128((__m128i *) (src + 128), m128iS4); + _mm_storeu_si128((__m128i *) (src + 128 + 8), m128iS5); + _mm_storeu_si128((__m128i *) (src + 128 + 16), m128iS6); + _mm_storeu_si128((__m128i *) (src + 128 + 24), m128iS7); + _mm_storeu_si128((__m128i *) (src + 256), m128iS8); + _mm_storeu_si128((__m128i *) (src + 256 + 8), m128iS9); + _mm_storeu_si128((__m128i *) (src + 256 + 16), m128iS10); + _mm_storeu_si128((__m128i *) (src + 256 + 24), m128iS11); + _mm_storeu_si128((__m128i *) (src + 384), m128iS12); + _mm_storeu_si128((__m128i *) (src + 384 + 8), m128iS13); + _mm_storeu_si128((__m128i *) (src + 384 + 16), m128iS14); + _mm_storeu_si128((__m128i *) (src + 384 + 24), m128iS15); + + _mm_storeu_si128((__m128i *) (src + 512), m128iS16); + _mm_storeu_si128((__m128i *) (src + 512 + 8), m128iS17); + _mm_storeu_si128((__m128i *) (src + 512 + 16), m128iS18); + _mm_storeu_si128((__m128i *) (src + 512 + 24), m128iS19); + _mm_storeu_si128((__m128i *) (src + 640), m128iS20); + _mm_storeu_si128((__m128i *) (src + 640 + 8), m128iS21); + _mm_storeu_si128((__m128i *) (src + 640 + 16), m128iS22); + _mm_storeu_si128((__m128i *) (src + 640 + 24), m128iS23); + _mm_storeu_si128((__m128i *) (src + 768), m128iS24); + _mm_storeu_si128((__m128i *) (src + 768 + 8), m128iS25); + _mm_storeu_si128((__m128i *) (src + 768 + 16), m128iS26); + _mm_storeu_si128((__m128i *) (src + 768 + 24), m128iS27); + _mm_storeu_si128((__m128i *) (src + 896), m128iS28); + _mm_storeu_si128((__m128i *) (src + 896 + 8), m128iS29); + _mm_storeu_si128((__m128i *) (src + 896 + 16), m128iS30); + _mm_storeu_si128((__m128i *) (src + 896 + 24), m128iS31); + dst = 
(uint16_t*) _dst + (i * stride); + for (k = 0; k < 8; k++) { + dst[0] = av_clip_uintp2(dst[0] + src[m],10); + dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10); + dst[2] = av_clip_uintp2(dst[2] + src[m + 16],10); + dst[3] = av_clip_uintp2(dst[3] + src[m + 24],10); + dst[4] = av_clip_uintp2( + dst[4] + src[m + 128],10); + dst[5] = av_clip_uintp2( + dst[5] + src[m + 128 + 8],10); + dst[6] = av_clip_uintp2( + dst[6] + src[m + 128 + 16],10); + dst[7] = av_clip_uintp2( + dst[7] + src[m + 128 + 24],10); + + dst[8] = av_clip_uintp2( + dst[8] + src[m + 256],10); + dst[9] = av_clip_uintp2( + dst[9] + src[m + 256 + 8],10); + dst[10] = av_clip_uintp2( + dst[10] + src[m + 256 + 16],10); + dst[11] = av_clip_uintp2( + dst[11] + src[m + 256 + 24],10); + dst[12] = av_clip_uintp2( + dst[12] + src[m + 384],10); + dst[13] = av_clip_uintp2( + dst[13] + src[m + 384 + 8],10); + dst[14] = av_clip_uintp2( + dst[14] + src[m + 384 + 16],10); + dst[15] = av_clip_uintp2( + dst[15] + src[m + 384 + 24],10); + + dst[16] = av_clip_uintp2( + dst[16] + src[m + 512],10); + dst[17] = av_clip_uintp2( + dst[17] + src[m + 512 + 8],10); + dst[18] = av_clip_uintp2( + dst[18] + src[m + 512 + 16],10); + dst[19] = av_clip_uintp2( + dst[19] + src[m + 512 + 24],10); + dst[20] = av_clip_uintp2( + dst[20] + src[m + 640],10); + dst[21] = av_clip_uintp2( + dst[21] + src[m + 640 + 8],10); + dst[22] = av_clip_uintp2( + dst[22] + src[m + 640 + 16],10); + dst[23] = av_clip_uintp2( + dst[23] + src[m + 640 + 24],10); + + dst[24] = av_clip_uintp2( + dst[24] + src[m + 768],10); + dst[25] = av_clip_uintp2( + dst[25] + src[m + 768 + 8],10); + dst[26] = av_clip_uintp2( + dst[26] + src[m + 768 + 16],10); + dst[27] = av_clip_uintp2( + dst[27] + src[m + 768 + 24],10); + dst[28] = av_clip_uintp2( + dst[28] + src[m + 896],10); + dst[29] = av_clip_uintp2( + dst[29] + src[m + 896 + 8],10); + dst[30] = av_clip_uintp2( + dst[30] + src[m + 896 + 16],10); + dst[31] = av_clip_uintp2( + dst[31] + src[m + 896 + 24],10); + + m += 1; + dst += 
stride; + } + if (i <= 16) { + int k = (i + 8) * 4; + m128iS0 = _mm_load_si128((__m128i *) (src + k)); + m128iS1 = _mm_load_si128((__m128i *) (src + 128 + k)); + m128iS2 = _mm_load_si128((__m128i *) (src + 256 + k)); + m128iS3 = _mm_load_si128((__m128i *) (src + 384 + k)); + m128iS4 = _mm_loadu_si128((__m128i *) (src + 512 + k)); + m128iS5 = _mm_load_si128((__m128i *) (src + 640 + k)); + m128iS6 = _mm_load_si128((__m128i *) (src + 768 + k)); + m128iS7 = _mm_load_si128((__m128i *) (src + 896 + k)); + m128iS8 = _mm_load_si128((__m128i *) (src + 8 + k)); + m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8 + k)); + m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8 + k)); + m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8 + k)); + m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8 + k)); + m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8 + k)); + m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8 + k)); + m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8 + k)); + m128iS16 = _mm_load_si128((__m128i *) (src + 16 + k)); + m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16 + k)); + m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16 + k)); + m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16 + k)); + m128iS20 = _mm_loadu_si128( + (__m128i *) (src + 512 + 16 + k)); + m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16 + k)); + m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16 + k)); + m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16 + k)); + m128iS24 = _mm_load_si128((__m128i *) (src + 24 + k)); + m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24 + k)); + m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24 + k)); + m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24 + k)); + m128iS28 = _mm_loadu_si128( + (__m128i *) (src + 512 + 24 + k)); + m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24 + k)); + m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24 + k)); + m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24 + k)); + } 
+ } + } + } +} +#endif + diff --git a/x86/sse-dct.h b/x86/sse-dct.h new file mode 100644 index 0000000..bc50ade --- /dev/null +++ b/x86/sse-dct.h @@ -0,0 +1,35 @@ +/* + * H.265 video codec. + * Copyright (c) 2013 openHEVC contributors + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef SSE_DCT_H +#define SSE_DCT_H + +#include +#include + +void ff_hevc_transform_skip_8_sse(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride); +void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_transform_4x4_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_transform_8x8_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_transform_16x16_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_transform_32x32_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); + +#endif diff --git a/x86/sse-motion.cc b/x86/sse-motion.cc new file mode 100644 index 0000000..c8c7571 --- /dev/null +++ b/x86/sse-motion.cc @@ -0,0 +1,4971 @@ +/* + * H.265 video codec. + * Copyright (c) 2013 openHEVC contributors + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. 
+ * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include // SSSE3 +#if HAVE_SSE4_1 +#include +#endif + +#include "sse-motion.h" +#include "libde265/util.h" + + +ALIGNED_16(const int8_t) epel_filters[7][16] = { + { -2, 58, 10, -2,-2, 58, 10, -2,-2, 58, 10, -2,-2, 58, 10, -2 }, + { -4, 54, 16, -2,-4, 54, 16, -2,-4, 54, 16, -2,-4, 54, 16, -2 }, + { -6, 46, 28, -4,-6, 46, 28, -4,-6, 46, 28, -4,-6, 46, 28, -4 }, + { -4, 36, 36, -4,-4, 36, 36, -4,-4, 36, 36, -4,-4, 36, 36, -4 }, + { -4, 28, 46, -6,-4, 28, 46, -6,-4, 28, 46, -6,-4, 28, 46, -6 }, + { -2, 16, 54, -4,-2, 16, 54, -4,-2, 16, 54, -4,-2, 16, 54, -4 }, + { -2, 10, 58, -2,-2, 10, 58, -2,-2, 10, 58, -2,-2, 10, 58, -2 }, +}; + +static const uint8_t qpel_extra_before[4] = { 0, 3, 3, 2 }; +static const uint8_t qpel_extra_after[4] = { 0, 3, 4, 4 }; +static const uint8_t qpel_extra[4] = { 0, 6, 7, 6 }; + +static const int epel_extra_before = 1; +static const int epel_extra_after = 2; +static const int epel_extra = 3; + +#define MAX_PB_SIZE 64 + +#define MASKMOVE 0 + +void print128(const char* prefix, __m128i r) +{ + unsigned char buf[16]; + + *(__m128i*)buf = r; + + printf("%s ",prefix); + for (int i=0;i<16;i++) + { + if (i>0) { printf(":"); } + printf("%02x", buf[i]); + } + + printf("\n"); +} + + +void printm32(const char* prefix, unsigned char* p) +{ + 
printf("%s ",prefix); + + for (int i=0;i<4;i++) + { + if (i>0) { printf(":"); } + printf("%02x", p[i]); + } + + printf("\n"); +} + + +#define BIT_DEPTH 8 + +void ff_hevc_put_unweighted_pred_8_sse(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height) { + int x, y; + uint8_t *dst = (uint8_t*) _dst; + __m128i r0, r1, f0; + + f0 = _mm_set1_epi16(32); + + + if(!(width & 15)) + { + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + r0 = _mm_load_si128((__m128i *) (src+x)); + + r1 = _mm_load_si128((__m128i *) (src+x + 8)); + r0 = _mm_adds_epi16(r0, f0); + + r1 = _mm_adds_epi16(r1, f0); + r0 = _mm_srai_epi16(r0, 6); + r1 = _mm_srai_epi16(r1, 6); + r0 = _mm_packus_epi16(r0, r1); + + _mm_storeu_si128((__m128i *) (dst+x), r0); + } + dst += dststride; + src += srcstride; + } + }else if(!(width & 7)) + { + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + r0 = _mm_load_si128((__m128i *) (src+x)); + + r0 = _mm_adds_epi16(r0, f0); + + r0 = _mm_srai_epi16(r0, 6); + r0 = _mm_packus_epi16(r0, r0); + + _mm_storel_epi64((__m128i *) (dst+x), r0); + } + dst += dststride; + src += srcstride; + } + }else if(!(width & 3)){ + for (y = 0; y < height; y++) { + for(x = 0;x < width; x+=4){ + r0 = _mm_loadl_epi64((__m128i *) (src+x)); + r0 = _mm_adds_epi16(r0, f0); + + r0 = _mm_srai_epi16(r0, 6); + r0 = _mm_packus_epi16(r0, r0); +#if MASKMOVE + _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x)); +#else + //r0 = _mm_shuffle_epi32 (r0, 0x00); + *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(r0); +#endif + } + dst += dststride; + src += srcstride; + } + }else{ + for (y = 0; y < height; y++) { + for(x = 0;x < width; x+=2){ + r0 = _mm_loadl_epi64((__m128i *) (src+x)); + r0 = _mm_adds_epi16(r0, f0); + + r0 = _mm_srai_epi16(r0, 6); + r0 = _mm_packus_epi16(r0, r0); +#if MASKMOVE + _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1),(char *) (dst+x)); +#else + 
*((uint16_t*)(dst+x)) = _mm_cvtsi128_si32(r0); +#endif + } + dst += dststride; + src += srcstride; + } + } + +} + +void ff_hevc_put_unweighted_pred_sse(uint8_t *_dst, ptrdiff_t _dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height) { + int x, y; + uint8_t *dst = (uint8_t*) _dst; + ptrdiff_t dststride = _dststride / sizeof(uint8_t); + __m128i r0, r1, f0; + int shift = 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int16_t offset = 1 << (shift - 1); +#else + int16_t offset = 0; + +#endif + f0 = _mm_set1_epi16(offset); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + r0 = _mm_load_si128((__m128i *) &src[x]); + + r1 = _mm_load_si128((__m128i *) &src[x + 8]); + r0 = _mm_adds_epi16(r0, f0); + + r1 = _mm_adds_epi16(r1, f0); + r0 = _mm_srai_epi16(r0, shift); + r1 = _mm_srai_epi16(r1, shift); + r0 = _mm_packus_epi16(r0, r1); + + _mm_storeu_si128((__m128i *) &dst[x], r0); + } + dst += dststride; + src += srcstride; + } +} + +void ff_hevc_put_weighted_pred_avg_8_sse(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, + ptrdiff_t srcstride, int width, + int height) { + int x, y; + uint8_t *dst = (uint8_t*) _dst; + __m128i r0, r1, f0, r2, r3; + + f0 = _mm_set1_epi16(64); + if(!(width & 15)){ + for (y = 0; y < height; y++) { + + for (x = 0; x < width; x += 16) { + r0 = _mm_load_si128((__m128i *) &src1[x]); + r1 = _mm_load_si128((__m128i *) &src1[x + 8]); + r2 = _mm_load_si128((__m128i *) &src2[x]); + r3 = _mm_load_si128((__m128i *) &src2[x + 8]); + + r0 = _mm_adds_epi16(r0, f0); + r1 = _mm_adds_epi16(r1, f0); + r0 = _mm_adds_epi16(r0, r2); + r1 = _mm_adds_epi16(r1, r3); + r0 = _mm_srai_epi16(r0, 7); + r1 = _mm_srai_epi16(r1, 7); + r0 = _mm_packus_epi16(r0, r1); + + _mm_storeu_si128((__m128i *) (dst + x), r0); + } + dst += dststride; + src1 += srcstride; + src2 += srcstride; + } + }else if(!(width & 7)){ + for (y = 0; y < height; y++) { + for(x=0;x= 1){ + if(!(width & 15)){ + for (y = 0; y < height; y++) { + for 
(x = 0; x < width; x += 16) { + x0 = _mm_load_si128((__m128i *) &src[x]); + x2 = _mm_load_si128((__m128i *) &src[x + 8]); + x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0), + _mm_mulhi_epi16(x0, c0)); + x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0), + _mm_mulhi_epi16(x2, c0)); + x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0), + _mm_mulhi_epi16(x0, c0)); + x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0), + _mm_mulhi_epi16(x2, c0)); + x0 = _mm_add_epi32(x0, add2); + x1 = _mm_add_epi32(x1, add2); + x2 = _mm_add_epi32(x2, add2); + x3 = _mm_add_epi32(x3, add2); + x0 = _mm_srai_epi32(x0, log2Wd); + x1 = _mm_srai_epi32(x1, log2Wd); + x2 = _mm_srai_epi32(x2, log2Wd); + x3 = _mm_srai_epi32(x3, log2Wd); + x0 = _mm_add_epi32(x0, add); + x1 = _mm_add_epi32(x1, add); + x2 = _mm_add_epi32(x2, add); + x3 = _mm_add_epi32(x3, add); + x0 = _mm_packus_epi32(x0, x1); + x2 = _mm_packus_epi32(x2, x3); + x0 = _mm_packus_epi16(x0, x2); + + _mm_storeu_si128((__m128i *) (dst + x), x0); + + } + dst += dststride; + src += srcstride; + } + }else if(!(width & 7)){ + for (y = 0; y < height; y++) { + for(x=0;x= 1) + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + x0 = _mm_load_si128((__m128i *) &src[x]); + x2 = _mm_load_si128((__m128i *) &src[x + 8]); + x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0), + _mm_mulhi_epi16(x0, c0)); + x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0), + _mm_mulhi_epi16(x2, c0)); + x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0), + _mm_mulhi_epi16(x0, c0)); + x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0), + _mm_mulhi_epi16(x2, c0)); + x0 = _mm_add_epi32(x0, add2); + x1 = _mm_add_epi32(x1, add2); + x2 = _mm_add_epi32(x2, add2); + x3 = _mm_add_epi32(x3, add2); + x0 = _mm_srai_epi32(x0, log2Wd); + x1 = _mm_srai_epi32(x1, log2Wd); + x2 = _mm_srai_epi32(x2, log2Wd); + x3 = _mm_srai_epi32(x3, log2Wd); + x0 = _mm_add_epi32(x0, add); + x1 = _mm_add_epi32(x1, add); + x2 = _mm_add_epi32(x2, add); + x3 = _mm_add_epi32(x3, add); + x0 = _mm_packus_epi32(x0, 
x1); + x2 = _mm_packus_epi32(x2, x3); + x0 = _mm_packus_epi16(x0, x2); + + _mm_storeu_si128((__m128i *) (dst + x), x0); + + } + dst += dststride; + src += srcstride; + } + else + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + + x0 = _mm_load_si128((__m128i *) &src[x]); + x2 = _mm_load_si128((__m128i *) &src[x + 8]); + x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0), + _mm_mulhi_epi16(x0, c0)); + x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0), + _mm_mulhi_epi16(x2, c0)); + x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0), + _mm_mulhi_epi16(x0, c0)); + x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0), + _mm_mulhi_epi16(x2, c0)); + + x0 = _mm_add_epi32(x0, add2); + x1 = _mm_add_epi32(x1, add2); + x2 = _mm_add_epi32(x2, add2); + x3 = _mm_add_epi32(x3, add2); + + x0 = _mm_packus_epi32(x0, x1); + x2 = _mm_packus_epi32(x2, x3); + x0 = _mm_packus_epi16(x0, x2); + + _mm_storeu_si128((__m128i *) (dst + x), x0); + + } + dst += dststride; + src += srcstride; + } +} +#endif + +#if HAVE_SSE4_1 +void ff_hevc_weighted_pred_avg_8_sse4(uint8_t denom, int16_t wl0Flag, + int16_t wl1Flag, int16_t ol0Flag, int16_t ol1Flag, + uint8_t *_dst, ptrdiff_t _dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height) { + int shift, shift2; + int log2Wd; + int o0; + int o1; + int x, y; + uint8_t *dst = (uint8_t*) _dst; + ptrdiff_t dststride = _dststride / sizeof(uint8_t); + __m128i x0, x1, x2, x3, r0, r1, r2, r3, c0, c1, c2; + shift = 14 - BIT_DEPTH; + log2Wd = denom + shift; + + o0 = (ol0Flag) * (1 << (BIT_DEPTH - 8)); + o1 = (ol1Flag) * (1 << (BIT_DEPTH - 8)); + shift2 = (log2Wd + 1); + c0 = _mm_set1_epi16(wl0Flag); + c1 = _mm_set1_epi16(wl1Flag); + c2 = _mm_set1_epi32((o0 + o1 + 1) << log2Wd); + + if(!(width & 15)){ + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + x0 = _mm_load_si128((__m128i *) &src1[x]); + x1 = _mm_load_si128((__m128i *) &src1[x + 8]); + x2 = _mm_load_si128((__m128i *) &src2[x]); + x3 = 
_mm_load_si128((__m128i *) &src2[x + 8]); + + r0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0), + _mm_mulhi_epi16(x0, c0)); + r1 = _mm_unpacklo_epi16(_mm_mullo_epi16(x1, c0), + _mm_mulhi_epi16(x1, c0)); + r2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c1), + _mm_mulhi_epi16(x2, c1)); + r3 = _mm_unpacklo_epi16(_mm_mullo_epi16(x3, c1), + _mm_mulhi_epi16(x3, c1)); + x0 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0), + _mm_mulhi_epi16(x0, c0)); + x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x1, c0), + _mm_mulhi_epi16(x1, c0)); + x2 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c1), + _mm_mulhi_epi16(x2, c1)); + x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x3, c1), + _mm_mulhi_epi16(x3, c1)); + r0 = _mm_add_epi32(r0, r2); + r1 = _mm_add_epi32(r1, r3); + r2 = _mm_add_epi32(x0, x2); + r3 = _mm_add_epi32(x1, x3); + + r0 = _mm_add_epi32(r0, c2); + r1 = _mm_add_epi32(r1, c2); + r2 = _mm_add_epi32(r2, c2); + r3 = _mm_add_epi32(r3, c2); + + r0 = _mm_srai_epi32(r0, shift2); + r1 = _mm_srai_epi32(r1, shift2); + r2 = _mm_srai_epi32(r2, shift2); + r3 = _mm_srai_epi32(r3, shift2); + + r0 = _mm_packus_epi32(r0, r2); + r1 = _mm_packus_epi32(r1, r3); + r0 = _mm_packus_epi16(r0, r1); + + _mm_storeu_si128((__m128i *) (dst + x), r0); + + } + dst += dststride; + src1 += srcstride; + src2 += srcstride; + } + }else if(!(width & 7)){ + for (y = 0; y < height; y++) { + for(x=0;x>1; + if(!(width & 7)){ + //x1= _mm_setzero_si128(); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + + x2 = _mm_loadu_si128((__m128i *) &src[x]); + x2 = _mm_slli_epi16(x2, 4); //shift 14 - BIT LENGTH + _mm_store_si128((__m128i *) &dst[x], x2); + + } + src += srcstride; + dst += dststride; + } + }else if(!(width & 3)){ + //x1= _mm_setzero_si128(); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 4) { + + x2 = _mm_loadl_epi64((__m128i *) &src[x]); + x2 = _mm_slli_epi16(x2, 4); //shift 14 - BIT LENGTH + + _mm_storel_epi64((__m128i *) &dst[x], x2); + + } + src += srcstride; + dst += dststride; + } + }else{ + 
//x1= _mm_setzero_si128(); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 2) { + + x2 = _mm_loadl_epi64((__m128i *) &src[x]); + x2 = _mm_slli_epi16(x2, 4); //shift 14 - BIT LENGTH + _mm_maskmoveu_si128(x2,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x)); + } + src += srcstride; + dst += dststride; + } + } + +} +#endif + +void ff_hevc_put_hevc_epel_h_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t _srcstride, + int width, int height, int mx, + int my, int16_t* mcbuffer, int bit_depth) { + int x, y; + const uint8_t *src = (const uint8_t*) _src; + ptrdiff_t srcstride = _srcstride; + const int8_t *filter = epel_filters[mx - 1]; + __m128i r0, bshuffle1, bshuffle2, x1, x2, x3; + int8_t filter_0 = filter[0]; + int8_t filter_1 = filter[1]; + int8_t filter_2 = filter[2]; + int8_t filter_3 = filter[3]; + r0 = _mm_set_epi8(filter_3, filter_2, filter_1, filter_0, filter_3, + filter_2, filter_1, filter_0, filter_3, filter_2, filter_1, + filter_0, filter_3, filter_2, filter_1, filter_0); + bshuffle1 = _mm_set_epi8(6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0); + + + /* + printf("---IN---SSE\n"); + + int extra_top = 1; + int extra_left = 1; + int extra_right = 2; + int extra_bottom = 2; + + for (int y=-extra_top;y>1; + const int8_t *filter = epel_filters[mx - 1]; + __m128i r0, bshuffle1, bshuffle2, x1, x2, x3, r1; + int8_t filter_0 = filter[0]; + int8_t filter_1 = filter[1]; + int8_t filter_2 = filter[2]; + int8_t filter_3 = filter[3]; + r0 = _mm_set_epi16(filter_3, filter_2, filter_1, + filter_0, filter_3, filter_2, filter_1, filter_0); + bshuffle1 = _mm_set_epi8(9,8,7,6,5,4, 3, 2,7,6,5,4, 3, 2, 1, 0); + + if(!(width & 3)){ + bshuffle2 = _mm_set_epi8(13,12,11,10,9,8,7,6,11,10, 9,8,7,6,5, 4); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 4) { + + x1 = _mm_loadu_si128((__m128i *) &src[x-1]); + x2 = _mm_shuffle_epi8(x1, bshuffle1); + x3 = _mm_shuffle_epi8(x1, bshuffle2); + + + x2 = _mm_madd_epi16(x2, 
r0); + x3 = _mm_madd_epi16(x3, r0); + x2 = _mm_hadd_epi32(x2, x3); + x2= _mm_srai_epi32(x2,2); //>> (BIT_DEPTH - 8) + + x2 = _mm_packs_epi32(x2,r0); + //give results back + _mm_storel_epi64((__m128i *) &dst[x], x2); + } + src += srcstride; + dst += dststride; + } + }else{ + r1= _mm_setzero_si128(); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 2) { + /* load data in register */ + x1 = _mm_loadu_si128((__m128i *) &src[x-1]); + x2 = _mm_shuffle_epi8(x1, bshuffle1); + + /* PMADDUBSW then PMADDW */ + x2 = _mm_madd_epi16(x2, r0); + x2 = _mm_hadd_epi32(x2, r1); + x2= _mm_srai_epi32(x2,2); //>> (BIT_DEPTH - 8) + x2 = _mm_packs_epi32(x2, r1); + /* give results back */ + _mm_maskmoveu_si128(x2,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x)); + } + src += srcstride; + dst += dststride; + } + } +} +#endif + + +void ff_hevc_put_hevc_epel_v_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx, + int my, int16_t* mcbuffer, int bit_depth) { + int x, y; + __m128i x0, x1, x2, x3, t0, t1, t2, t3, r0, f0, f1, f2, f3, r1; + uint8_t *src = (uint8_t*) _src; + ptrdiff_t srcstride = _srcstride / sizeof(uint8_t); + const int8_t *filter = epel_filters[my - 1]; + int8_t filter_0 = filter[0]; + int8_t filter_1 = filter[1]; + int8_t filter_2 = filter[2]; + int8_t filter_3 = filter[3]; + f0 = _mm_set1_epi16(filter_0); + f1 = _mm_set1_epi16(filter_1); + f2 = _mm_set1_epi16(filter_2); + f3 = _mm_set1_epi16(filter_3); + + if(!(width & 15)){ + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + /* check if memory needs to be reloaded */ + + x0 = _mm_loadu_si128((__m128i *) &src[x - srcstride]); + x1 = _mm_loadu_si128((__m128i *) &src[x]); + x2 = _mm_loadu_si128((__m128i *) &src[x + srcstride]); + x3 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]); + + t0 = _mm_unpacklo_epi8(x0, _mm_setzero_si128()); + t1 = _mm_unpacklo_epi8(x1, _mm_setzero_si128()); + t2 = 
_mm_unpacklo_epi8(x2, _mm_setzero_si128()); + t3 = _mm_unpacklo_epi8(x3, _mm_setzero_si128()); + + x0 = _mm_unpackhi_epi8(x0, _mm_setzero_si128()); + x1 = _mm_unpackhi_epi8(x1, _mm_setzero_si128()); + x2 = _mm_unpackhi_epi8(x2, _mm_setzero_si128()); + x3 = _mm_unpackhi_epi8(x3, _mm_setzero_si128()); + + /* multiply by correct value : */ + r0 = _mm_mullo_epi16(t0, f0); + r1 = _mm_mullo_epi16(x0, f0); + r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1)); + r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x1, f1)); + r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2)); + r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x2, f2)); + r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3)); + r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x3, f3)); + /* give results back */ + _mm_store_si128((__m128i *) &dst[x], r0); + _mm_storeu_si128((__m128i *) &dst[x + 8], r1); + } + src += srcstride; + dst += dststride; + } + }else if(!(width & 7)){ + r1= _mm_setzero_si128(); + for (y = 0; y < height; y++) { + for(x=0;x>1; + const int8_t *filter = epel_filters[my - 1]; + int8_t filter_0 = filter[0]; + int8_t filter_1 = filter[1]; + int8_t filter_2 = filter[2]; + int8_t filter_3 = filter[3]; + f0 = _mm_set1_epi16(filter_0); + f1 = _mm_set1_epi16(filter_1); + f2 = _mm_set1_epi16(filter_2); + f3 = _mm_set1_epi16(filter_3); + + if(!(width & 7)){ + r1= _mm_setzero_si128(); + for (y = 0; y < height; y++) { + for(x=0;x> (BIT_DEPTH - 8) + t0= _mm_srai_epi32(t0,2);//>> (BIT_DEPTH - 8) + + r0= _mm_packs_epi32(r0, t0); + // give results back + _mm_storeu_si128((__m128i *) &dst[x], r0); + } + src += srcstride; + dst += dststride; + } + }else if(!(width & 3)){ + r1= _mm_setzero_si128(); + for (y = 0; y < height; y++) { + for(x=0;x> (BIT_DEPTH - 8) + + r0= _mm_packs_epi32(r0, r0); + + // give results back + _mm_storel_epi64((__m128i *) &dst[x], r0); + } + src += srcstride; + dst += dststride; + } + }else{ + r1= _mm_setzero_si128(); + for (y = 0; y < height; y++) { + for(x=0;x> (BIT_DEPTH - 8) + + r0= _mm_packs_epi32(r0, r0); + + /* 
give results back */ + _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x)); + + } + src += srcstride; + dst += dststride; + } + } +} +#endif + +void ff_hevc_put_hevc_epel_hv_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx, + int my, int16_t* mcbuffer, int bit_depth) { + int x, y; + uint8_t *src = (uint8_t*) _src; + ptrdiff_t srcstride = _srcstride; + const int8_t *filter_h = epel_filters[mx - 1]; + const int8_t *filter_v = epel_filters[my - 1]; + __m128i r0, bshuffle1, bshuffle2, x0, x1, x2, x3, t0, t1, t2, t3, f0, f1, + f2, f3, r1, r2; + int8_t filter_0 = filter_h[0]; + int8_t filter_1 = filter_h[1]; + int8_t filter_2 = filter_h[2]; + int8_t filter_3 = filter_h[3]; + int16_t *tmp = mcbuffer; + r0 = _mm_set_epi8(filter_3, filter_2, filter_1, filter_0, filter_3, + filter_2, filter_1, filter_0, filter_3, filter_2, filter_1, + filter_0, filter_3, filter_2, filter_1, filter_0); + bshuffle1 = _mm_set_epi8(6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0); + + src -= epel_extra_before * srcstride; + + f3 = _mm_set1_epi16(filter_v[3]); + f1 = _mm_set1_epi16(filter_v[1]); + f2 = _mm_set1_epi16(filter_v[2]); + f0 = _mm_set1_epi16(filter_v[0]); + + /* horizontal treatment */ + if(!(width & 7)){ + bshuffle2 = _mm_set_epi8(10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, + 4); + for (y = 0; y < height + epel_extra; y++) { + for (x = 0; x < width; x += 8) { + + x1 = _mm_loadu_si128((__m128i *) &src[x - 1]); + x2 = _mm_shuffle_epi8(x1, bshuffle1); + x3 = _mm_shuffle_epi8(x1, bshuffle2); + + /* PMADDUBSW then PMADDW */ + x2 = _mm_maddubs_epi16(x2, r0); + x3 = _mm_maddubs_epi16(x3, r0); + x2 = _mm_hadd_epi16(x2, x3); + _mm_store_si128((__m128i *) &tmp[x], x2); + } + src += srcstride; + tmp += MAX_PB_SIZE; + } + tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE; + + /* vertical treatment */ + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + /* check if memory 
needs to be reloaded */ + x0 = _mm_load_si128((__m128i *) &tmp[x - MAX_PB_SIZE]); + x1 = _mm_load_si128((__m128i *) &tmp[x]); + x2 = _mm_load_si128((__m128i *) &tmp[x + MAX_PB_SIZE]); + x3 = _mm_load_si128((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]); + + r0 = _mm_mullo_epi16(x0, f0); + r1 = _mm_mulhi_epi16(x0, f0); + r2 = _mm_mullo_epi16(x1, f1); + t0 = _mm_unpacklo_epi16(r0, r1); + x0 = _mm_unpackhi_epi16(r0, r1); + r0 = _mm_mulhi_epi16(x1, f1); + r1 = _mm_mullo_epi16(x2, f2); + t1 = _mm_unpacklo_epi16(r2, r0); + x1 = _mm_unpackhi_epi16(r2, r0); + r2 = _mm_mulhi_epi16(x2, f2); + r0 = _mm_mullo_epi16(x3, f3); + t2 = _mm_unpacklo_epi16(r1, r2); + x2 = _mm_unpackhi_epi16(r1, r2); + r1 = _mm_mulhi_epi16(x3, f3); + t3 = _mm_unpacklo_epi16(r0, r1); + x3 = _mm_unpackhi_epi16(r0, r1); + + /* multiply by correct value : */ + r0 = _mm_add_epi32(t0, t1); + r1 = _mm_add_epi32(x0, x1); + r0 = _mm_add_epi32(r0, t2); + r1 = _mm_add_epi32(r1, x2); + r0 = _mm_add_epi32(r0, t3); + r1 = _mm_add_epi32(r1, x3); + r0 = _mm_srai_epi32(r0, 6); + r1 = _mm_srai_epi32(r1, 6); + + /* give results back */ + r0 = _mm_packs_epi32(r0, r1); + _mm_store_si128((__m128i *) &dst[x], r0); + } + tmp += MAX_PB_SIZE; + dst += dststride; + } + }else if(!(width & 3)){ + for (y = 0; y < height + epel_extra; y ++) { + for(x=0;x>1; + const int8_t *filter_h = epel_filters[mx - 1]; + const int8_t *filter_v = epel_filters[my - 1]; + __m128i r0, bshuffle1, bshuffle2, x0, x1, x2, x3, t0, t1, t2, t3, f0, f1, + f2, f3, r1, r2, r3; + int8_t filter_0 = filter_h[0]; + int8_t filter_1 = filter_h[1]; + int8_t filter_2 = filter_h[2]; + int8_t filter_3 = filter_h[3]; + int16_t *tmp = mcbuffer; + + r0 = _mm_set_epi16(filter_3, filter_2, filter_1, + filter_0, filter_3, filter_2, filter_1, filter_0); + bshuffle1 = _mm_set_epi8(9,8,7,6,5,4, 3, 2,7,6,5,4, 3, 2, 1, 0); + + src -= epel_extra_before * srcstride; + + f0 = _mm_set1_epi16(filter_v[0]); + f1 = _mm_set1_epi16(filter_v[1]); + f2 = _mm_set1_epi16(filter_v[2]); + f3 = 
_mm_set1_epi16(filter_v[3]); + + + /* horizontal treatment */ + if(!(width & 3)){ + bshuffle2 = _mm_set_epi8(13,12,11,10,9,8,7,6,11,10, 9,8,7,6,5, 4); + for (y = 0; y < height + epel_extra; y ++) { + for(x=0;x> (BIT_DEPTH - 8) + + x2 = _mm_packs_epi32(x2,r0); + //give results back + _mm_storel_epi64((__m128i *) &tmp[x], x2); + + } + src += srcstride; + tmp += MAX_PB_SIZE; + } + tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE; + + // vertical treatment + + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 4) { + x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]); + x1 = _mm_loadl_epi64((__m128i *) &tmp[x]); + x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]); + x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]); + + r0 = _mm_mullo_epi16(x0, f0); + r1 = _mm_mulhi_epi16(x0, f0); + r2 = _mm_mullo_epi16(x1, f1); + t0 = _mm_unpacklo_epi16(r0, r1); + + r0 = _mm_mulhi_epi16(x1, f1); + r1 = _mm_mullo_epi16(x2, f2); + t1 = _mm_unpacklo_epi16(r2, r0); + + r2 = _mm_mulhi_epi16(x2, f2); + r0 = _mm_mullo_epi16(x3, f3); + t2 = _mm_unpacklo_epi16(r1, r2); + + r1 = _mm_mulhi_epi16(x3, f3); + t3 = _mm_unpacklo_epi16(r0, r1); + + + + r0 = _mm_add_epi32(t0, t1); + r0 = _mm_add_epi32(r0, t2); + r0 = _mm_add_epi32(r0, t3); + r0 = _mm_srai_epi32(r0, 6); + + // give results back + r0 = _mm_packs_epi32(r0, r0); + _mm_storel_epi64((__m128i *) &dst[x], r0); + } + tmp += MAX_PB_SIZE; + dst += dststride; + } + }else{ + bshuffle2=_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1); + r1= _mm_setzero_si128(); + for (y = 0; y < height + epel_extra; y ++) { + for(x=0;x> (BIT_DEPTH - 8) + x2 = _mm_packs_epi32(x2, r1); + /* give results back */ + _mm_maskmoveu_si128(x2,bshuffle2,(char *) (tmp+x)); + } + src += srcstride; + tmp += MAX_PB_SIZE; + } + + tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE; + + /* vertical treatment */ + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 2) { + /* check if memory needs to be reloaded */ + x0 = 
_mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]); + x1 = _mm_loadl_epi64((__m128i *) &tmp[x]); + x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]); + x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]); + + r0 = _mm_mullo_epi16(x0, f0); + t0 = _mm_mulhi_epi16(x0, f0); + + x0= _mm_unpacklo_epi16(r0,t0); + + r1 = _mm_mullo_epi16(x1, f1); + t1 = _mm_mulhi_epi16(x1, f1); + + x1= _mm_unpacklo_epi16(r1,t1); + + r2 = _mm_mullo_epi16(x2, f2); + t2 = _mm_mulhi_epi16(x2, f2); + + x2= _mm_unpacklo_epi16(r2,t2); + + r3 = _mm_mullo_epi16(x3, f3); + t3 = _mm_mulhi_epi16(x3, f3); + + x3= _mm_unpacklo_epi16(r3,t3); + + r0= _mm_add_epi32(x0,x1); + r1= _mm_add_epi32(x2,x3); + r0= _mm_add_epi32(r0,r1); + r0 = _mm_srai_epi32(r0, 6); + /* give results back */ + r0 = _mm_packs_epi32(r0, r0); + _mm_maskmoveu_si128(r0,bshuffle2,(char *) (dst+x)); + } + tmp += MAX_PB_SIZE; + dst += dststride; + } + } +} +#endif + +void ff_hevc_put_hevc_qpel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, + int16_t* mcbuffer) { + int x, y; + __m128i x1, x2, x3, x0; + uint8_t *src = (uint8_t*) _src; + ptrdiff_t srcstride = _srcstride; + x0= _mm_setzero_si128(); + if(!(width & 15)){ + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + + x1 = _mm_loadu_si128((__m128i *) &src[x]); + x2 = _mm_unpacklo_epi8(x1, x0); + + x3 = _mm_unpackhi_epi8(x1, x0); + + x2 = _mm_slli_epi16(x2, 6); + x3 = _mm_slli_epi16(x3, 6); + _mm_storeu_si128((__m128i *) &dst[x], x2); + _mm_storeu_si128((__m128i *) &dst[x + 8], x3); + + } + src += srcstride; + dst += dststride; + } + }else if(!(width & 7)){ + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + + x1 = _mm_loadu_si128((__m128i *) &src[x]); + x2 = _mm_unpacklo_epi8(x1, x0); + x2 = _mm_slli_epi16(x2, 6); + _mm_storeu_si128((__m128i *) &dst[x], x2); + + } + src += srcstride; + dst += dststride; + } + }else if(!(width & 3)){ + for (y = 0; y < height; y++) { + 
for(x=0;x>1; + if(!(width & 7)){ + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + + x1 = _mm_loadu_si128((__m128i *) &src[x]); + x2 = _mm_slli_epi16(x1, 4); //14-BIT DEPTH + _mm_storeu_si128((__m128i *) &dst[x], x2); + + } + src += srcstride; + dst += dststride; + } + }else if(!(width & 3)){ + for (y = 0; y < height; y++) { + for(x=0;x>1; + __m128i x0, x1, x2, x3, r0; + + r0 = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1); + x0= _mm_setzero_si128(); + x3= _mm_set_epi32(0,0,0,-1); + for (y = 0; y < height; y ++) { + for(x=0;x>BIT_DEPTH-8 + x1= _mm_packs_epi32(x1,x0); + // dst[x]= _mm_extract_epi16(x1,0); + _mm_maskmoveu_si128(x1,x3,(char *) (dst+x)); + } + src += srcstride; + dst += dststride; + } + +} +#endif + + +void ff_hevc_put_hevc_qpel_h_2_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, + int16_t* mcbuffer) { + int x, y; + const uint8_t *src = _src; + ptrdiff_t srcstride = _srcstride / sizeof(uint8_t); + __m128i x1, r0, x2, x3, x4, x5; + + r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11, + 4, -1); + + /* LOAD src from memory to registers to limit memory bandwidth */ + if(!(width - 15)){ + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + /* load data in register */ + x1 = _mm_loadu_si128((__m128i *) &src[x - 3]); + x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1)); + x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2), + _mm_srli_si128(x1, 3)); + x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4), + _mm_srli_si128(x1, 5)); + x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6), + _mm_srli_si128(x1, 7)); + + /* PMADDUBSW then PMADDW */ + x2 = _mm_maddubs_epi16(x2, r0); + x3 = _mm_maddubs_epi16(x3, r0); + x4 = _mm_maddubs_epi16(x4, r0); + x5 = _mm_maddubs_epi16(x5, r0); + x2 = _mm_hadd_epi16(x2, x3); + x4 = _mm_hadd_epi16(x4, x5); + x2 = _mm_hadd_epi16(x2, x4); + /* give results back */ + _mm_store_si128((__m128i *) &dst[x],x2); + } + src += srcstride; + 
dst += dststride; + } + + }else{ + + for (y = 0; y < height; y ++) { + for(x=0;x> 1; + __m128i x1, x2, x3, x4, x5, x6, x7, r1; + __m128i t1, t2, t3, t4, t5, t6, t7, t8; + + t7= _mm_set1_epi32(1); + t6= _mm_set1_epi32(-5); + t5= _mm_set1_epi32(17); + t4= _mm_set1_epi32(58); + t3= _mm_set1_epi32(-10); + t2= _mm_set1_epi32(4); + t1= _mm_set1_epi32(-1); + t8= _mm_setzero_si128(); + + for (y = 0; y < height; y ++) { + for(x=0;x> 1; + __m128i x1, x2, x3, x4, x5, x6, x7, x8, r0, r1, r2; + __m128i t1, t2, t3, t4, t5, t6, t7, t8; + r1 = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1); + + t1= _mm_set1_epi32(-1); + t2= _mm_set1_epi32(4); + t3= _mm_set1_epi32(-11); + t4= _mm_set1_epi32(40); + t5= _mm_set1_epi32(40); + t6= _mm_set1_epi32(-11); + t7= _mm_set1_epi32(4); + t8= _mm_set1_epi32(-1); + + { + x = 0; + r0 = _mm_setzero_si128(); + for (y = 0; y < height; y ++) { + for(x=0;x> 1; + __m128i x1, x2, x3, x4, x5, x6, x7, r0; + __m128i t1, t2, t3, t4, t5, t6, t7, t8; + + t7 = _mm_set1_epi32(-1); + t6 = _mm_set1_epi32(4); + t5 = _mm_set1_epi32(-10); + t4 = _mm_set1_epi32(58); + t3 = _mm_set1_epi32(17); + t2 = _mm_set1_epi32(-5); + t1 = _mm_set1_epi32(1); + t8= _mm_setzero_si128(); + { + + for (y = 0; y < height; y ++) { + for(x=0;x + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef SSE_MOTION_H +#define SSE_MOTION_H + +#include +#include + + +void ff_hevc_put_unweighted_pred_8_sse(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height); + +void ff_hevc_put_weighted_pred_avg_8_sse(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, + ptrdiff_t srcstride, int width, + int height); + +void ff_hevc_put_hevc_epel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t srcstride, + int width, int height, + int mx, int my, int16_t* mcbuffer); +void ff_hevc_put_hevc_epel_h_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t srcstride, + int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); +void ff_hevc_put_hevc_epel_v_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t srcstride, + int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); +void ff_hevc_put_hevc_epel_hv_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t srcstride, + int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + +void ff_hevc_put_hevc_qpel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_v_1_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_v_2_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_v_3_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_1_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void 
ff_hevc_put_hevc_qpel_h_1_v_1_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_1_v_2_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_1_v_3_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_2_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_2_v_1_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_2_v_2_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_2_v_3_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_3_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_3_v_1_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_3_v_2_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_3_v_3_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); + +#endif diff --git a/x86/sse.cc b/x86/sse.cc new file mode 100644 index 0000000..2ee0f8f --- /dev/null +++ b/x86/sse.cc @@ -0,0 +1,104 @@ +/* + * H.265 video codec. 
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifdef _MSC_VER +#include +#endif + +#include "x86/sse.h" +#include "x86/sse-motion.h" +#include "x86/sse-dct.h" + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifdef __GNUC__ +#include +#endif + +void init_acceleration_functions_sse(struct acceleration_functions* accel) +{ + uint32_t ecx=0,edx=0; + +#ifdef _MSC_VER + uint32_t regs[4]; + int a = 1; + + __cpuid((int *)regs, (int)a); + + ecx = regs[2]; + edx = regs[3]; +#else + uint32_t eax,ebx; + __get_cpuid(1, &eax,&ebx,&ecx,&edx); +#endif + + // printf("CPUID EAX=1 -> ECX=%x EDX=%x\n", regs[2], regs[3]); + + //int have_MMX = !!(edx & (1<<23)); + int have_SSE = !!(edx & (1<<25)); + int have_SSE4_1 = !!(ecx & (1<<19)); + + // printf("MMX:%d SSE:%d SSE4_1:%d\n",have_MMX,have_SSE,have_SSE4_1); + + if (have_SSE) { + } + +#if HAVE_SSE4_1 + if (have_SSE4_1) { + accel->put_unweighted_pred_8 = ff_hevc_put_unweighted_pred_8_sse; + accel->put_weighted_pred_avg_8 = ff_hevc_put_weighted_pred_avg_8_sse; + + accel->put_hevc_epel_8 = ff_hevc_put_hevc_epel_pixels_8_sse; + accel->put_hevc_epel_h_8 = ff_hevc_put_hevc_epel_h_8_sse; + accel->put_hevc_epel_v_8 = ff_hevc_put_hevc_epel_v_8_sse; + accel->put_hevc_epel_hv_8 = ff_hevc_put_hevc_epel_hv_8_sse; + + accel->put_hevc_qpel_8[0][0] = 
ff_hevc_put_hevc_qpel_pixels_8_sse; + accel->put_hevc_qpel_8[0][1] = ff_hevc_put_hevc_qpel_v_1_8_sse; + accel->put_hevc_qpel_8[0][2] = ff_hevc_put_hevc_qpel_v_2_8_sse; + accel->put_hevc_qpel_8[0][3] = ff_hevc_put_hevc_qpel_v_3_8_sse; + accel->put_hevc_qpel_8[1][0] = ff_hevc_put_hevc_qpel_h_1_8_sse; + accel->put_hevc_qpel_8[1][1] = ff_hevc_put_hevc_qpel_h_1_v_1_sse; + accel->put_hevc_qpel_8[1][2] = ff_hevc_put_hevc_qpel_h_1_v_2_sse; + accel->put_hevc_qpel_8[1][3] = ff_hevc_put_hevc_qpel_h_1_v_3_sse; + accel->put_hevc_qpel_8[2][0] = ff_hevc_put_hevc_qpel_h_2_8_sse; + accel->put_hevc_qpel_8[2][1] = ff_hevc_put_hevc_qpel_h_2_v_1_sse; + accel->put_hevc_qpel_8[2][2] = ff_hevc_put_hevc_qpel_h_2_v_2_sse; + accel->put_hevc_qpel_8[2][3] = ff_hevc_put_hevc_qpel_h_2_v_3_sse; + accel->put_hevc_qpel_8[3][0] = ff_hevc_put_hevc_qpel_h_3_8_sse; + accel->put_hevc_qpel_8[3][1] = ff_hevc_put_hevc_qpel_h_3_v_1_sse; + accel->put_hevc_qpel_8[3][2] = ff_hevc_put_hevc_qpel_h_3_v_2_sse; + accel->put_hevc_qpel_8[3][3] = ff_hevc_put_hevc_qpel_h_3_v_3_sse; + + accel->transform_skip_8 = ff_hevc_transform_skip_8_sse; + + // actually, for these two functions, the scalar fallback seems to be faster than the SSE code + //accel->transform_4x4_luma_add_8 = ff_hevc_transform_4x4_luma_add_8_sse4; // SSE-4 only TODO + //accel->transform_4x4_add_8 = ff_hevc_transform_4x4_add_8_sse4; + + accel->transform_add_8[1] = ff_hevc_transform_8x8_add_8_sse4; + accel->transform_add_8[2] = ff_hevc_transform_16x16_add_8_sse4; + accel->transform_add_8[3] = ff_hevc_transform_32x32_add_8_sse4; + } +#endif +} + diff --git a/x86/sse.h b/x86/sse.h new file mode 100644 index 0000000..d4663d0 --- /dev/null +++ b/x86/sse.h @@ -0,0 +1,28 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. 
+ * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_SSE_H +#define DE265_SSE_H + +#include "acceleration.h" + +void init_acceleration_functions_sse(struct acceleration_functions* accel); + +#endif