Changes of Revision 20

x265.changes Changed
40
 
1
@@ -1,4 +1,38 @@
2
 -------------------------------------------------------------------
3
+Mon Sep 30 12:34:56 UTC 2024 - olaf@aepfle.de
4
+
5
+- Update to version 4.0
6
+  New features:
7
+  * Alpha Channel feature.
8
+  * Screen Content Coding (SCC).
9
+  * MV-HEVC feature.
10
+  Enhancements to existing features:
11
+  * Added support for VMAF v3.x.
12
+  API changes
13
+  * Add command line parameter for Alpha Channel feature :option:`--alpha`.
14
+  * Add command line parameter for SCC feature :option:`--scc 1`.
15
+  * Add command line parameters for the MV-HEVC feature
16
+    :option:`--multiview-config "multiview_config.txt"`.
17
+  Optimizations
18
+  * Arm SIMD optimizations: Several time-consuming scalar C
19
+    functions now have SIMD implementations on Arm platforms.
20
+    Existing Arm SIMD implementations have also been optimized.
21
+    These optimizations result in up to 57% faster encoding
22
+    compared to release 3.6.
23
+  * Arm SIMD optimizations include use of Armv8.4 DotProd, Armv8.6
24
+    I8MM, and Armv9 SVE2 instruction set extensions. The following
25
+    algorithms now have optimized SIMD implementations: SAD, SSE,
26
+    DCT, SAO, convolution, quantization, intra_planar,
27
+    intraFilter, intrapred DC and IDCT16x16.
28
+  Bug fixes
29
+  * Fix broken y4m pipe input.
30
+  * Fix SCC crash on multipass encode.
31
+  * Fix mcstf when :option:`--bframes` value was less than 5.
32
+  * Fix lowpass DCT for high bit depth.
33
+  * Fix issue in default code flow and memory leak.
35
+
36
+-------------------------------------------------------------------
37
 Thu Jun 13 05:58:19 UTC 2024 - Luigi Baldoni <aloisio@gmx.com>
38
 
39
 - Update to version 3.6
40
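
For reference, the new 4.0 command-line options listed in the changelog above could be exercised roughly as follows; the file names and option combinations are illustrative only and are not part of this request:

  # Alpha Channel encode (assumes source material carrying an alpha plane)
  x265 --input input_with_alpha.y4m --alpha --output out_alpha.hevc
  # Screen Content Coding
  x265 --input screen_capture.y4m --scc 1 --output out_scc.hevc
  # MV-HEVC encode driven by a multiview configuration file
  x265 --multiview-config "multiview_config.txt" --output out_multiview.hevc
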
x265.spec Changed
73
 
1
@@ -17,12 +17,12 @@
2
 #
3
 
4
 
5
-%define sover   209
6
+%define sover   212
7
 %define libname lib%{name}
8
 %define libsoname %{libname}-%{sover}
9
-%define uver    3_6
10
+%define uver    4_0
11
 Name:           x265
12
-Version:        3.6
13
+Version:        4.0
14
 Release:        0
15
 Summary:        A free h265/HEVC encoder - encoder binary
16
 License:        GPL-2.0-or-later
17
@@ -30,11 +30,20 @@
18
 URL:            https://bitbucket.org/multicoreware/x265_git
19
 Source0:        https://bitbucket.org/multicoreware/x265_git/downloads/%{name}_%{version}.tar.gz
20
 Patch1:         x265.pkgconfig.patch
21
-Patch2:         x265-fix_enable512.patch
22
 Patch3:         0001-Fix-arm-flags.patch
23
 Patch4:         0004-Do-not-build-with-assembly-support-on-arm.patch
24
-BuildRequires:  cmake >= 2.8.8
25
+BuildRequires:  cmake
26
+%if 0%{?suse_version} > 1500
27
 BuildRequires:  gcc-c++
28
+%else
29
+%if 0%{?sle_version} > 150500
30
+BuildRequires:  gcc13
31
+BuildRequires:  gcc13-c++
32
+%else
33
+BuildRequires:  gcc10
34
+BuildRequires:  gcc10-c++
35
+%endif
36
+%endif
37
 BuildRequires:  nasm >= 2.13
38
 BuildRequires:  pkgconfig
39
 %ifarch x86_64
40
@@ -73,16 +82,27 @@
41
 streams.
42
 
43
 %prep
44
-%setup -q -n %{name}_%{version}
45
-%autopatch -p1
46
+%autosetup -p1 -n %{name}_%{version}
47
 
48
+%build
49
+test -x "$(type -p gcc)"    && CC="$_"
50
+test -x "$(type -p g++)"    && CXX="$_"
51
+test -x "$(type -p gcc-10)" && CC="$_"
52
+test -x "$(type -p g++-10)" && CXX="$_"
53
+test -x "$(type -p gcc-13)" && CC="$_"
54
+test -x "$(type -p g++-13)" && CXX="$_"
55
+export CC="$(readlink -f ${CC})"
56
+export CXX="$(readlink -f ${CXX})"
57
+CFLAGS='%optflags -Wno-misleading-indentation -Wno-unused-parameter -Wno-unused-variable'
58
+CXXFLAGS='%optflags -Wno-misleading-indentation -Wno-unused-parameter -Wno-unused-variable'
59
 # set the version by hand
60
-sed -i "/^include(Version)/d" source/CMakeLists.txt
61
+sed -i~ "/^include(Version)/d" source/CMakeLists.txt
62
+diff -u "$_"~ "$_" && exit 1
63
 # force version number in the soname
64
-sed -i 's/hdr10plus-shared PROPERTIES OUTPUT_NAME hdr10plus/hdr10plus-shared PROPERTIES OUTPUT_NAME hdr10plus-%{version}/' \
65
+sed -i~ 's/hdr10plus-shared PROPERTIES OUTPUT_NAME hdr10plus/hdr10plus-shared PROPERTIES OUTPUT_NAME hdr10plus-%{version}/' \
66
        source/CMakeLists.txt
67
+diff -u "$_"~ "$_" && exit 1
68
 
69
-%build
70
 SOURCE_DIR="$PWD"/source
71
 COMMON_FLAGS="-DENABLE_TESTS=OFF -DENABLE_PIC=ON -Wno-dev"
72
 HIGH_BIT_DEPTH_FLAGS="-DENABLE_CLI=OFF -DENABLE_SHARED=OFF -DEXPORT_C_API=OFF -DHIGH_BIT_DEPTH=ON"
73
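
A note on the compiler-selection idiom introduced in the new %build section above: each `test -x "$(type -p ...)"` probe relies on bash's $_ (the last argument of the previous command, here the expanded compiler path), and later probes overwrite earlier assignments, so the newest compiler actually installed wins before readlink -f canonicalizes the path. A minimal standalone sketch of the same idea, with example compiler names:

  #!/bin/bash
  # Prefer the newest g++ found; "$_" captures the path probed by the preceding test.
  test -x "$(type -p g++)"    && CXX="$_"
  test -x "$(type -p g++-10)" && CXX="$_"
  test -x "$(type -p g++-13)" && CXX="$_"
  export CXX="$(readlink -f "${CXX}")"
  echo "using CXX=${CXX}"
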
0001-Fix-arm-flags.patch Changed
74
 
1
@@ -6,11 +6,9 @@
2
  source/CMakeLists.txt | 7 ++-----
3
  1 file changed, 2 insertions(+), 5 deletions(-)
4
 
5
-diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
6
-index ab5ddfe..eb9b19b 100755
7
 --- a/source/CMakeLists.txt
8
 +++ b/source/CMakeLists.txt
9
-@@ -253,10 +253,7 @@ if(GCC)
10
+@@ -257,10 +257,7 @@
11
      elseif(ARM)
12
          find_package(Neon)
13
          if(CPU_HAS_NEON)
14
@@ -20,20 +18,42 @@
15
 -            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
16
          endif()
17
      endif()
18
-   if(ARM64 OR CROSS_COMPILE_ARM64)
19
-@@ -265,13 +262,13 @@ if(GCC)
20
-         find_package(SVE2)
21
-         if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
22
-             message(STATUS "Found SVE2")
23
--          set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
24
-+          set(ARM_ARGS -fPIC -flax-vector-conversions)
25
-             add_definitions(-DHAVE_SVE2)
26
-             add_definitions(-DHAVE_SVE)
27
-             add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
28
-         elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
29
-             message(STATUS "Found SVE")
30
--          set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
31
-+          set(ARM_ARGS -fPIC -flax-vector-conversions)
32
-             add_definitions(-DHAVE_SVE)
33
-             add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
34
-         elseif(CPU_HAS_NEON)
35
+     if(ARM64)
36
+--- a/source/cmake/FindNEON_DOTPROD.cmake
37
++++ b/source/cmake/FindNEON_DOTPROD.cmake
38
+@@ -17,5 +17,5 @@
39
+ endif()
40
+ 
41
+ if(has_dot_product)
42
+-    set(CPU_HAS_NEON_DOTPROD 1)
43
++    set(CPU_HAS_NEON_DOTPROD 0)
44
+ endif()
45
+--- a/source/cmake/FindNEON_I8MM.cmake
46
++++ b/source/cmake/FindNEON_I8MM.cmake
47
+@@ -17,5 +17,5 @@
48
+ endif()
49
+ 
50
+ if(has_i8mm)
51
+-    set(CPU_HAS_NEON_I8MM 1)
52
++    set(CPU_HAS_NEON_I8MM 0)
53
+ endif()
54
+--- a/source/cmake/FindSVE.cmake
55
++++ b/source/cmake/FindSVE.cmake
56
+@@ -17,5 +17,5 @@
57
+ endif()
58
+ 
59
+ if(sve_version)
60
+-    set(CPU_HAS_SVE 1)
61
++    set(CPU_HAS_SVE 0)
62
+ endif()
63
+--- a/source/cmake/FindSVE2.cmake
64
++++ b/source/cmake/FindSVE2.cmake
65
+@@ -17,6 +17,6 @@
66
+ endif()
67
+ 
68
+ if(sve2_version)
69
+-    set(CPU_HAS_SVE 1)
70
+-    set(CPU_HAS_SVE2 1)
71
++    set(CPU_HAS_SVE 0)
72
++    set(CPU_HAS_SVE2 0)
73
+ endif()
74
0004-Do-not-build-with-assembly-support-on-arm.patch Changed
22
 
1
@@ -6,11 +6,9 @@
2
  source/CMakeLists.txt | 9 ---------
3
  1 file changed, 9 deletions(-)
4
 
5
-diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
6
-index 672cc2d..f112330 100755
7
 --- a/source/CMakeLists.txt
8
 +++ b/source/CMakeLists.txt
9
-@@ -73,15 +73,6 @@ elseif(POWERMATCH GREATER "-1")
10
+@@ -72,15 +72,6 @@
11
          add_definitions(-DPPC64=1)
12
          message(STATUS "Detected POWER PPC64 target processor")
13
      endif()
14
@@ -24,5 +22,5 @@
15
 -    set(ARM 1)
16
 -    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
17
  elseif(ARM64MATCH GREATER "-1")
18
-     #if(CROSS_COMPILE_ARM64)
19
-         #message(STATUS "Cross compiling for ARM64 arch")
20
+     message(STATUS "Detected ARM64 target processor")
21
+     set(ARM64 1)
22
x265-fix_enable512.patch Deleted
28
 
1
@@ -1,26 +0,0 @@
2
---- a/source/common/cpu.cpp
3
-+++ b/source/common/cpu.cpp
4
-@@ -110,6 +110,11 @@ const cpu_name_t cpu_names =
5
-     { "", 0 },
6
- };
7
- 
8
-+bool detect512()
9
-+{
10
-+    return(enable512);
11
-+}
12
-+
13
- #if X265_ARCH_X86
14
- 
15
- extern "C" {
16
-@@ -123,11 +128,6 @@ uint64_t PFX(cpu_xgetbv)(int xcr);
17
- #pragma warning(disable: 4309) // truncation of constant value
18
- #endif
19
- 
20
--bool detect512()
21
--{
22
--    return(enable512);
23
--}
24
--
25
- uint32_t cpu_detect(bool benableavx512 )
26
- {
27
- 
28
baselibs.conf Changed
4
 
1
@@ -1,1 +1,1 @@
2
-libx265-209
3
+libx265-212
4
x265_3.6.tar.gz/source/common/aarch64/ipfilter-common.S Deleted
201
 
1
@@ -1,1436 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2022-2023 MulticoreWare, Inc
4
- *
5
- * Authors: David Chen <david.chen@myais.com.cn>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-// This file contains the macros written using NEON instruction set
26
-// that are also used by the SVE2 functions
27
-
28
-// Macros below follow these conventions:
29
-// - input data in registers: v0, v1, v2, v3, v4, v5, v6, v7
30
-// - constants in registers: v24, v25, v26, v27, v31
31
-// - temporary registers: v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30.
32
-// - _32b macros output a result in v17.4s
33
-// - _64b and _32b_1 macros output results in v17.4s, v18.4s
34
-
35
-#include "asm.S"
36
-
37
-.arch           armv8-a
38
-
39
-#ifdef __APPLE__
40
-.section __RODATA,__rodata
41
-#else
42
-.section .rodata
43
-#endif
44
-
45
-.align 4
46
-
47
-.macro vextin8 v
48
-    ldp             d6, d7, x11, #16
49
-.if \v == 0
50
-    // qpel_filter_0 only uses values in v3
51
-    ext             v3.8b, v6.8b, v7.8b, #4
52
-.else
53
-.if \v != 3
54
-    ext             v0.8b, v6.8b, v7.8b, #1
55
-.endif
56
-    ext             v1.8b, v6.8b, v7.8b, #2
57
-    ext             v2.8b, v6.8b, v7.8b, #3
58
-    ext             v3.8b, v6.8b, v7.8b, #4
59
-    ext             v4.8b, v6.8b, v7.8b, #5
60
-    ext             v5.8b, v6.8b, v7.8b, #6
61
-    ext             v6.8b, v6.8b, v7.8b, #7
62
-.endif
63
-.endm
64
-
65
-.macro vextin8_64 v
66
-    ldp             q6, q7, x11, #32
67
-.if \v == 0
68
-    // qpel_filter_0 only uses values in v3
69
-    ext             v3.16b, v6.16b, v7.16b, #4
70
-.else
71
-.if \v != 3
72
-    // qpel_filter_3 does not use values in v0
73
-    ext             v0.16b, v6.16b, v7.16b, #1
74
-.endif
75
-    ext             v1.16b, v6.16b, v7.16b, #2
76
-    ext             v2.16b, v6.16b, v7.16b, #3
77
-    ext             v3.16b, v6.16b, v7.16b, #4
78
-    ext             v4.16b, v6.16b, v7.16b, #5
79
-    ext             v5.16b, v6.16b, v7.16b, #6
80
-.if \v == 1
81
-    ext             v6.16b, v6.16b, v7.16b, #7
82
-    // qpel_filter_1 does not use v7
83
-.else
84
-    ext             v16.16b, v6.16b, v7.16b, #7
85
-    ext             v7.16b, v6.16b, v7.16b, #8
86
-    mov             v6.16b, v16.16b
87
-.endif
88
-.endif
89
-.endm
90
-
91
-.macro vextin8_chroma v
92
-    ldp             d6, d7, x11, #16
93
-.if \v == 0
94
-    // qpel_filter_chroma_0 only uses values in v1
95
-    ext             v1.8b, v6.8b, v7.8b, #2
96
-.else
97
-    ext             v0.8b, v6.8b, v7.8b, #1
98
-    ext             v1.8b, v6.8b, v7.8b, #2
99
-    ext             v2.8b, v6.8b, v7.8b, #3
100
-    ext             v3.8b, v6.8b, v7.8b, #4
101
-.endif
102
-.endm
103
-
104
-.macro vextin8_chroma_64 v
105
-    ldp             q16, q17, x11, #32
106
-.if \v == 0
107
-    // qpel_filter_chroma_0 only uses values in v1
108
-    ext             v1.16b, v16.16b, v17.16b, #2
109
-.else
110
-    ext             v0.16b, v16.16b, v17.16b, #1
111
-    ext             v1.16b, v16.16b, v17.16b, #2
112
-    ext             v2.16b, v16.16b, v17.16b, #3
113
-    ext             v3.16b, v16.16b, v17.16b, #4
114
-.endif
115
-.endm
116
-
117
-.macro qpel_load_32b v
118
-.if \v == 0
119
-    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
120
-    ld1             {v3.8b}, x6, x1
121
-.elseif \v == 1 || \v == 2 || \v == 3
122
-.if \v != 3                           // not used in qpel_filter_3
123
-    ld1             {v0.8b}, x6, x1
124
-.else
125
-    add             x6, x6, x1
126
-.endif
127
-    ld1             {v1.8b}, x6, x1
128
-    ld1             {v2.8b}, x6, x1
129
-    ld1             {v3.8b}, x6, x1
130
-    ld1             {v4.8b}, x6, x1
131
-    ld1             {v5.8b}, x6, x1
132
-.if \v != 1                           // not used in qpel_filter_1
133
-    ld1             {v6.8b}, x6, x1
134
-    ld1             {v7.8b}, x6
135
-.else
136
-    ld1             {v6.8b}, x6
137
-.endif
138
-.endif
139
-.endm
140
-
141
-.macro qpel_load_64b v
142
-.if \v == 0
143
-    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
144
-    ld1             {v3.16b}, x6, x1
145
-.elseif \v == 1 || \v == 2 || \v == 3
146
-.if \v != 3                           // not used in qpel_filter_3
147
-    ld1             {v0.16b}, x6, x1
148
-.else
149
-    add             x6, x6, x1
150
-.endif
151
-    ld1             {v1.16b}, x6, x1
152
-    ld1             {v2.16b}, x6, x1
153
-    ld1             {v3.16b}, x6, x1
154
-    ld1             {v4.16b}, x6, x1
155
-    ld1             {v5.16b}, x6, x1
156
-.if \v != 1                           // not used in qpel_filter_1
157
-    ld1             {v6.16b}, x6, x1
158
-    ld1             {v7.16b}, x6
159
-.else
160
-    ld1             {v6.16b}, x6
161
-.endif
162
-.endif
163
-.endm
164
-
165
-.macro qpel_chroma_load_32b v
166
-.if \v == 0
167
-    // qpel_filter_chroma_0 only uses values in v1
168
-    add             x6, x6, x1
169
-    ldr             d1, x6
170
-.else
171
-    ld1             {v0.8b}, x6, x1
172
-    ld1             {v1.8b}, x6, x1
173
-    ld1             {v2.8b}, x6, x1
174
-    ld1             {v3.8b}, x6
175
-.endif
176
-.endm
177
-
178
-.macro qpel_chroma_load_64b v
179
-.if \v == 0
180
-    // qpel_filter_chroma_0 only uses values in v1
181
-    add             x6, x6, x1
182
-    ldr             q1, x6
183
-.else
184
-    ld1             {v0.16b}, x6, x1
185
-    ld1             {v1.16b}, x6, x1
186
-    ld1             {v2.16b}, x6, x1
187
-    ld1             {v3.16b}, x6
188
-.endif
189
-.endm
190
-
191
-//          a, b,   c,  d,  e,   f, g,  h
192
-// .hword   0, 0,   0, 64,  0,   0, 0,  0
193
-.macro qpel_start_0
194
-    movi            v24.16b, #64
195
-.endm
196
-
197
-.macro qpel_filter_0_32b
198
-    umull           v17.8h, v3.8b, v24.8b    // 64*d
199
-.endm
200
-
201
x265_3.6.tar.gz/source/common/aarch64/ipfilter-sve2.S Deleted
201
 
1
@@ -1,1282 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2022-2023 MulticoreWare, Inc
4
- *
5
- * Authors: David Chen <david.chen@myais.com.cn>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-// Functions in this file:
26
-// ***** luma_vpp *****
27
-// ***** luma_vps *****
28
-// ***** luma_vsp *****
29
-// ***** luma_vss *****
30
-// ***** luma_hpp *****
31
-// ***** luma_hps *****
32
-// ***** chroma_vpp *****
33
-// ***** chroma_vps *****
34
-// ***** chroma_vsp *****
35
-// ***** chroma_vss *****
36
-// ***** chroma_hpp *****
37
-// ***** chroma_hps *****
38
-
39
-#include "asm-sve.S"
40
-#include "ipfilter-common.S"
41
-
42
-.arch armv8-a+sve2
43
-
44
-#ifdef __APPLE__
45
-.section __RODATA,__rodata
46
-#else
47
-.section .rodata
48
-#endif
49
-
50
-.align 4
51
-
52
-.text
53
-
54
-.macro qpel_load_32b_sve2 v
55
-.if \v == 0
56
-    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
57
-    ld1b            {z3.h}, p0/z, x6
58
-    add             x6, x6, x1
59
-.elseif \v == 1 || \v == 2 || \v == 3
60
-.if \v != 3                           // not used in qpel_filter_3
61
-    ld1b            {z0.h}, p0/z, x6
62
-    add             x6, x6, x1
63
-.else
64
-    add             x6, x6, x1
65
-.endif
66
-    ld1b            {z1.h}, p0/z, x6
67
-    add             x6, x6, x1
68
-    ld1b            {z2.h}, p0/z, x6
69
-    add             x6, x6, x1
70
-    ld1b            {z3.h}, p0/z, x6
71
-    add             x6, x6, x1
72
-    ld1b            {z4.h}, p0/z, x6
73
-    add             x6, x6, x1
74
-    ld1b            {z5.h}, p0/z, x6
75
-    add             x6, x6, x1
76
-.if \v != 1                           // not used in qpel_filter_1
77
-    ld1b            {z6.h}, p0/z, x6
78
-    add             x6, x6, x1
79
-    ld1b            {z7.h}, p0/z, x6
80
-.else
81
-    ld1b            {z6.h}, p0/z, x6
82
-.endif
83
-.endif
84
-.endm
85
-
86
-.macro qpel_load_64b_sve2_gt_16 v
87
-.if \v == 0
88
-    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
89
-    ld1b            {z3.h}, p2/z, x6
90
-    add             x6, x6, x1
91
-.elseif \v == 1 || \v == 2 || \v == 3
92
-.if \v != 3                           // not used in qpel_filter_3
93
-    ld1b            {z0.h}, p2/z, x6
94
-    add             x6, x6, x1
95
-.else
96
-    add             x6, x6, x1
97
-.endif
98
-    ld1b            {z1.h}, p2/z, x6
99
-    add             x6, x6, x1
100
-    ld1b            {z2.h}, p2/z, x6
101
-    add             x6, x6, x1
102
-    ld1b            {z3.h}, p2/z, x6
103
-    add             x6, x6, x1
104
-    ld1b            {z4.h}, p2/z, x6
105
-    add             x6, x6, x1
106
-    ld1b            {z5.h}, p2/z, x6
107
-    add             x6, x6, x1
108
-.if \v != 1                           // not used in qpel_filter_1
109
-    ld1b            {z6.h}, p2/z, x6
110
-    add             x6, x6, x1
111
-    ld1b            {z7.h}, p2/z, x6
112
-.else
113
-    ld1b            {z6.h}, p2/z, x6
114
-.endif
115
-.endif
116
-.endm
117
-
118
-.macro qpel_chroma_load_32b_sve2 v
119
-.if \v == 0
120
-    // qpel_filter_chroma_0 only uses values in v1
121
-    add             x6, x6, x1
122
-    ld1b            {z1.h}, p0/z, x6
123
-.else
124
-    ld1b            {z0.h}, p0/z, x6
125
-    add             x6, x6, x1
126
-    ld1b            {z1.h}, p0/z, x6
127
-    add             x6, x6, x1
128
-    ld1b            {z2.h}, p0/z, x6
129
-    add             x6, x6, x1
130
-    ld1b            {z3.h}, p0/z, x6
131
-.endif
132
-.endm
133
-
134
-.macro qpel_start_sve2_0
135
-    mov             z24.h, #64
136
-.endm
137
-
138
-.macro qpel_filter_sve2_0_32b
139
-    mul             z17.h, z3.h, z24.h    // 64*d
140
-.endm
141
-
142
-.macro qpel_filter_sve2_0_64b
143
-    qpel_filter_sve2_0_32b
144
-    mul             z18.h, z11.h, z24.h
145
-.endm
146
-
147
-.macro qpel_start_sve2_1
148
-    mov             z24.h, #58
149
-    mov             z25.h, #10
150
-    mov             z26.h, #17
151
-    mov             z27.h, #5
152
-.endm
153
-
154
-.macro qpel_filter_sve2_1_32b
155
-    mul             z19.h, z2.h, z25.h  // c*10
156
-    mul             z17.h, z3.h, z24.h  // d*58
157
-    mul             z21.h, z4.h, z26.h  // e*17
158
-    mul             z23.h, z5.h, z27.h  // f*5
159
-    sub             z17.h, z17.h, z19.h // d*58 - c*10
160
-    lsl             z18.h, z1.h, #2      // b*4
161
-    add             z17.h, z17.h, z21.h // d*58 - c*10 + e*17
162
-    sub             z21.h, z6.h, z0.h   // g - a
163
-    add             z17.h, z17.h, z18.h // d*58 - c*10 + e*17 + b*4
164
-    sub             z21.h, z21.h, z23.h // g - a - f*5
165
-    add             z17.h, z17.h, z21.h // d*58 - c*10 + e*17 + b*4 + g - a - f*5
166
-.endm
167
-
168
-.macro qpel_filter_sve2_1_64b
169
-    qpel_filter_sve2_1_32b
170
-    mul             z20.h, z10.h, z25.h  // c*10
171
-    mul             z18.h, z11.h, z24.h  // d*58
172
-    mul             z21.h, z12.h, z26.h  // e*17
173
-    mul             z23.h, z13.h, z27.h  // f*5
174
-    sub             z18.h, z18.h, z20.h   // d*58 - c*10
175
-    lsl             z28.h, z30.h, #2       // b*4
176
-    add             z18.h, z18.h, z21.h   // d*58 - c*10 + e*17
177
-    sub             z21.h, z14.h, z29.h   // g - a
178
-    add             z18.h, z18.h, z28.h   // d*58 - c*10 + e*17 + b*4
179
-    sub             z21.h, z21.h, z23.h   // g - a - f*5
180
-    add             z18.h, z18.h, z21.h   // d*58 - c*10 + e*17 + b*4 + g - a - f*5
181
-.endm
182
-
183
-.macro qpel_start_sve2_2
184
-    mov             z24.h, #11
185
-    mov             z25.h, #40
186
-.endm
187
-
188
-.macro qpel_filter_sve2_2_32b
189
-    add             z17.h, z3.h, z4.h     // d + e
190
-    add             z19.h, z2.h, z5.h     // c + f
191
-    add             z23.h, z1.h, z6.h     // b + g
192
-    add             z21.h, z0.h, z7.h     // a + h
193
-    mul             z17.h, z17.h, z25.h   // 40 * (d + e)
194
-    mul             z19.h, z19.h, z24.h   // 11 * (c + f)
195
-    lsl             z23.h, z23.h, #2       // (b + g) * 4
196
-    add             z19.h, z19.h, z21.h   // 11 * (c + f) + a + h
197
-    add             z17.h, z17.h, z23.h   // 40 * (d + e) + (b + g) * 4
198
-    sub             z17.h, z17.h, z19.h   // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
199
-.endm
200
-
201
x265_3.6.tar.gz/source/common/aarch64/ipfilter.S Deleted
201
 
1
@@ -1,1054 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2021 MulticoreWare, Inc
4
- *
5
- * Authors: Sebastian Pop <spop@amazon.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-// Functions in this file:
26
-// ***** luma_vpp *****
27
-// ***** luma_vps *****
28
-// ***** luma_vsp *****
29
-// ***** luma_vss *****
30
-// ***** luma_hpp *****
31
-// ***** luma_hps *****
32
-// ***** chroma_vpp *****
33
-// ***** chroma_vps *****
34
-// ***** chroma_vsp *****
35
-// ***** chroma_vss *****
36
-// ***** chroma_hpp *****
37
-// ***** chroma_hps *****
38
-
39
-#include "asm.S"
40
-#include "ipfilter-common.S"
41
-
42
-#ifdef __APPLE__
43
-.section __RODATA,__rodata
44
-#else
45
-.section .rodata
46
-#endif
47
-
48
-.align 4
49
-
50
-.text
51
-
52
-// ***** luma_vpp *****
53
-// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
54
-.macro LUMA_VPP_4xN h
55
-function x265_interp_8tap_vert_pp_4x\h\()_neon
56
-    movrel          x10, g_luma_s16
57
-    sub             x0, x0, x1
58
-    sub             x0, x0, x1, lsl #1         // src -= 3 * srcStride
59
-    lsl             x4, x4, #4
60
-    ldr             q0, x10, x4              // q0 = luma interpolate coeff
61
-    dup             v24.8h, v0.h0
62
-    dup             v25.8h, v0.h1
63
-    trn1            v24.2d, v24.2d, v25.2d
64
-    dup             v26.8h, v0.h2
65
-    dup             v27.8h, v0.h3
66
-    trn1            v26.2d, v26.2d, v27.2d
67
-    dup             v28.8h, v0.h4
68
-    dup             v29.8h, v0.h5
69
-    trn1            v28.2d, v28.2d, v29.2d
70
-    dup             v30.8h, v0.h6
71
-    dup             v31.8h, v0.h7
72
-    trn1            v30.2d, v30.2d, v31.2d
73
-
74
-    // prepare to load 8 lines
75
-    ld1             {v0.s}0, x0, x1
76
-    ld1             {v0.s}1, x0, x1
77
-    ushll           v0.8h, v0.8b, #0
78
-    ld1             {v1.s}0, x0, x1
79
-    ld1             {v1.s}1, x0, x1
80
-    ushll           v1.8h, v1.8b, #0
81
-    ld1             {v2.s}0, x0, x1
82
-    ld1             {v2.s}1, x0, x1
83
-    ushll           v2.8h, v2.8b, #0
84
-    ld1             {v3.s}0, x0, x1
85
-    ld1             {v3.s}1, x0, x1
86
-    ushll           v3.8h, v3.8b, #0
87
-
88
-    mov             x9, #\h
89
-.loop_4x\h:
90
-    ld1             {v4.s}0, x0, x1
91
-    ld1             {v4.s}1, x0, x1
92
-    ushll           v4.8h, v4.8b, #0
93
-
94
-    // row0-1
95
-    mul             v16.8h, v0.8h, v24.8h
96
-    ext             v21.16b, v0.16b, v1.16b, #8
97
-    mul             v17.8h, v21.8h, v24.8h
98
-    mov             v0.16b, v1.16b
99
-
100
-    // row2-3
101
-    mla             v16.8h, v1.8h, v26.8h
102
-    ext             v21.16b, v1.16b, v2.16b, #8
103
-    mla             v17.8h, v21.8h, v26.8h
104
-    mov             v1.16b, v2.16b
105
-
106
-    // row4-5
107
-    mla             v16.8h, v2.8h, v28.8h
108
-    ext             v21.16b, v2.16b, v3.16b, #8
109
-    mla             v17.8h, v21.8h, v28.8h
110
-    mov             v2.16b, v3.16b
111
-
112
-    // row6-7
113
-    mla             v16.8h, v3.8h, v30.8h
114
-    ext             v21.16b, v3.16b, v4.16b, #8
115
-    mla             v17.8h, v21.8h, v30.8h
116
-    mov             v3.16b, v4.16b
117
-
118
-    // sum row0-7
119
-    trn1            v20.2d, v16.2d, v17.2d
120
-    trn2            v21.2d, v16.2d, v17.2d
121
-    add             v16.8h, v20.8h, v21.8h
122
-
123
-    sqrshrun        v16.8b,  v16.8h,  #6
124
-    st1             {v16.s}0, x2, x3
125
-    st1             {v16.s}1, x2, x3
126
-
127
-    sub             x9, x9, #2
128
-    cbnz            x9, .loop_4x\h
129
-    ret
130
-endfunc
131
-.endm
132
-
133
-LUMA_VPP_4xN 4
134
-LUMA_VPP_4xN 8
135
-LUMA_VPP_4xN 16
136
-
137
-// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
138
-.macro LUMA_VPP w, h
139
-function x265_interp_8tap_vert_pp_\w\()x\h\()_neon
140
-    cmp             x4, #0
141
-    b.eq            0f
142
-    cmp             x4, #1
143
-    b.eq            1f
144
-    cmp             x4, #2
145
-    b.eq            2f
146
-    cmp             x4, #3
147
-    b.eq            3f
148
-0:
149
-    FILTER_LUMA_VPP \w, \h, 0
150
-1:
151
-    FILTER_LUMA_VPP \w, \h, 1
152
-2:
153
-    FILTER_LUMA_VPP \w, \h, 2
154
-3:
155
-    FILTER_LUMA_VPP \w, \h, 3
156
-endfunc
157
-.endm
158
-
159
-LUMA_VPP 8, 4
160
-LUMA_VPP 8, 8
161
-LUMA_VPP 8, 16
162
-LUMA_VPP 8, 32
163
-LUMA_VPP 12, 16
164
-LUMA_VPP 16, 4
165
-LUMA_VPP 16, 8
166
-LUMA_VPP 16, 16
167
-LUMA_VPP 16, 32
168
-LUMA_VPP 16, 64
169
-LUMA_VPP 16, 12
170
-LUMA_VPP 24, 32
171
-LUMA_VPP 32, 8
172
-LUMA_VPP 32, 16
173
-LUMA_VPP 32, 32
174
-LUMA_VPP 32, 64
175
-LUMA_VPP 32, 24
176
-LUMA_VPP 48, 64
177
-LUMA_VPP 64, 16
178
-LUMA_VPP 64, 32
179
-LUMA_VPP 64, 64
180
-LUMA_VPP 64, 48
181
-
182
-// ***** luma_vps *****
183
-// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
184
-.macro LUMA_VPS_4xN h
185
-function x265_interp_8tap_vert_ps_4x\h\()_neon
186
-    lsl             x3, x3, #1
187
-    lsl             x5, x4, #6
188
-    lsl             x4, x1, #2
189
-    sub             x4, x4, x1
190
-    sub             x0, x0, x4
191
-
192
-    mov             w6, #8192
193
-    dup             v28.4s, w6
194
-    mov             x4, #\h
195
-    movrel          x12, g_lumaFilter
196
-    add             x12, x12, x5
197
-    ld1r            {v16.2d}, x12, #8
198
-    ld1r            {v17.2d}, x12, #8
199
-    ld1r            {v18.2d}, x12, #8
200
-    ld1r            {v19.2d}, x12, #8
201
x265_3.6.tar.gz/source/common/aarch64/sad-a-common.S Deleted
201
 
1
@@ -1,514 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2022-2023 MulticoreWare, Inc
4
- *
5
- * Authors: David Chen <david.chen@myais.com.cn>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-// This file contains the macros written using NEON instruction set
26
-// that are also used by the SVE2 functions
27
-
28
-#include "asm.S"
29
-
30
-.arch           armv8-a
31
-
32
-#ifdef __APPLE__
33
-.section __RODATA,__rodata
34
-#else
35
-.section .rodata
36
-#endif
37
-
38
-.align 4
39
-
40
-.macro SAD_START_4 f
41
-    ld1             {v0.s}0, x0, x1
42
-    ld1             {v0.s}1, x0, x1
43
-    ld1             {v1.s}0, x2, x3
44
-    ld1             {v1.s}1, x2, x3
45
-    \f              v16.8h, v0.8b, v1.8b
46
-.endm
47
-
48
-.macro SAD_4 h
49
-.rept \h / 2 - 1
50
-    SAD_START_4 uabal
51
-.endr
52
-.endm
53
-
54
-.macro SAD_START_8 f
55
-    ld1             {v0.8b}, x0, x1
56
-    ld1             {v1.8b}, x2, x3
57
-    ld1             {v2.8b}, x0, x1
58
-    ld1             {v3.8b}, x2, x3
59
-    \f              v16.8h, v0.8b, v1.8b
60
-    \f              v17.8h, v2.8b, v3.8b
61
-.endm
62
-
63
-.macro SAD_8 h
64
-.rept \h / 2 - 1
65
-    SAD_START_8 uabal
66
-.endr
67
-.endm
68
-
69
-.macro SAD_START_16 f
70
-    ld1             {v0.16b}, x0, x1
71
-    ld1             {v1.16b}, x2, x3
72
-    ld1             {v2.16b}, x0, x1
73
-    ld1             {v3.16b}, x2, x3
74
-    \f              v16.8h, v0.8b, v1.8b
75
-    \f\()2          v17.8h, v0.16b, v1.16b
76
-    uabal           v16.8h, v2.8b, v3.8b
77
-    uabal2          v17.8h, v2.16b, v3.16b
78
-.endm
79
-
80
-.macro SAD_16 h
81
-.rept \h / 2 - 1
82
-    SAD_START_16 uabal
83
-.endr
84
-.endm
85
-
86
-.macro SAD_START_32
87
-    movi            v16.16b, #0
88
-    movi            v17.16b, #0
89
-    movi            v18.16b, #0
90
-    movi            v19.16b, #0
91
-.endm
92
-
93
-.macro SAD_32
94
-    ld1             {v0.16b-v1.16b}, x0, x1
95
-    ld1             {v2.16b-v3.16b}, x2, x3
96
-    ld1             {v4.16b-v5.16b}, x0, x1
97
-    ld1             {v6.16b-v7.16b}, x2, x3
98
-    uabal           v16.8h, v0.8b, v2.8b
99
-    uabal2          v17.8h, v0.16b, v2.16b
100
-    uabal           v18.8h, v1.8b, v3.8b
101
-    uabal2          v19.8h, v1.16b, v3.16b
102
-    uabal           v16.8h, v4.8b, v6.8b
103
-    uabal2          v17.8h, v4.16b, v6.16b
104
-    uabal           v18.8h, v5.8b, v7.8b
105
-    uabal2          v19.8h, v5.16b, v7.16b
106
-.endm
107
-
108
-.macro SAD_END_32
109
-    add             v16.8h, v16.8h, v17.8h
110
-    add             v17.8h, v18.8h, v19.8h
111
-    add             v16.8h, v16.8h, v17.8h
112
-    uaddlv          s0, v16.8h
113
-    fmov            w0, s0
114
-    ret
115
-.endm
116
-
117
-.macro SAD_START_64
118
-    movi            v16.16b, #0
119
-    movi            v17.16b, #0
120
-    movi            v18.16b, #0
121
-    movi            v19.16b, #0
122
-    movi            v20.16b, #0
123
-    movi            v21.16b, #0
124
-    movi            v22.16b, #0
125
-    movi            v23.16b, #0
126
-.endm
127
-
128
-.macro SAD_64
129
-    ld1             {v0.16b-v3.16b}, x0, x1
130
-    ld1             {v4.16b-v7.16b}, x2, x3
131
-    ld1             {v24.16b-v27.16b}, x0, x1
132
-    ld1             {v28.16b-v31.16b}, x2, x3
133
-    uabal           v16.8h, v0.8b, v4.8b
134
-    uabal2          v17.8h, v0.16b, v4.16b
135
-    uabal           v18.8h, v1.8b, v5.8b
136
-    uabal2          v19.8h, v1.16b, v5.16b
137
-    uabal           v20.8h, v2.8b, v6.8b
138
-    uabal2          v21.8h, v2.16b, v6.16b
139
-    uabal           v22.8h, v3.8b, v7.8b
140
-    uabal2          v23.8h, v3.16b, v7.16b
141
-
142
-    uabal           v16.8h, v24.8b, v28.8b
143
-    uabal2          v17.8h, v24.16b, v28.16b
144
-    uabal           v18.8h, v25.8b, v29.8b
145
-    uabal2          v19.8h, v25.16b, v29.16b
146
-    uabal           v20.8h, v26.8b, v30.8b
147
-    uabal2          v21.8h, v26.16b, v30.16b
148
-    uabal           v22.8h, v27.8b, v31.8b
149
-    uabal2          v23.8h, v27.16b, v31.16b
150
-.endm
151
-
152
-.macro SAD_END_64
153
-    add             v16.8h, v16.8h, v17.8h
154
-    add             v17.8h, v18.8h, v19.8h
155
-    add             v16.8h, v16.8h, v17.8h
156
-    uaddlp          v16.4s, v16.8h
157
-    add             v18.8h, v20.8h, v21.8h
158
-    add             v19.8h, v22.8h, v23.8h
159
-    add             v17.8h, v18.8h, v19.8h
160
-    uaddlp          v17.4s, v17.8h
161
-    add             v16.4s, v16.4s, v17.4s
162
-    uaddlv          d0, v16.4s
163
-    fmov            x0, d0
164
-    ret
165
-.endm
166
-
167
-.macro SAD_START_12
168
-    movrel          x12, sad12_mask
169
-    ld1             {v31.16b}, x12
170
-    movi            v16.16b, #0
171
-    movi            v17.16b, #0
172
-.endm
173
-
174
-.macro SAD_12
175
-    ld1             {v0.16b}, x0, x1
176
-    and             v0.16b, v0.16b, v31.16b
177
-    ld1             {v1.16b}, x2, x3
178
-    and             v1.16b, v1.16b, v31.16b
179
-    ld1             {v2.16b}, x0, x1
180
-    and             v2.16b, v2.16b, v31.16b
181
-    ld1             {v3.16b}, x2, x3
182
-    and             v3.16b, v3.16b, v31.16b
183
-    uabal           v16.8h, v0.8b, v1.8b
184
-    uabal2          v17.8h, v0.16b, v1.16b
185
-    uabal           v16.8h, v2.8b, v3.8b
186
-    uabal2          v17.8h, v2.16b, v3.16b
187
-.endm
188
-
189
-.macro SAD_END_12
190
-    add             v16.8h, v16.8h, v17.8h
191
-    uaddlv          s0, v16.8h
192
-    fmov            w0, s0
193
-    ret
194
-.endm
195
-
196
-.macro SAD_START_24
197
-    movi            v16.16b, #0
198
-    movi            v17.16b, #0
199
-    movi            v18.16b, #0
200
-    sub             x1, x1, #16
201
x265_3.6.tar.gz/source/common/aarch64/sad-a-sve2.S Deleted
201
 
1
@@ -1,511 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2022-2023 MulticoreWare, Inc
4
- *
5
- * Authors: David Chen <david.chen@myais.com.cn>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#include "asm-sve.S"
26
-#include "sad-a-common.S"
27
-
28
-.arch armv8-a+sve2
29
-
30
-#ifdef __APPLE__
31
-.section __RODATA,__rodata
32
-#else
33
-.section .rodata
34
-#endif
35
-
36
-.align 4
37
-
38
-.text
39
-
40
-.macro SAD_SVE2_16 h
41
-    mov             z16.d, #0
42
-    ptrue           p0.h, vl16
43
-.rept \h
44
-    ld1b            {z0.h}, p0/z, x0
45
-    ld1b            {z2.h}, p0/z, x2
46
-    add             x0, x0, x1
47
-    add             x2, x2, x3
48
-    uaba            z16.h, z0.h, z2.h
49
-.endr
50
-    uaddv           d0, p0, z16.h
51
-    fmov            w0, s0
52
-    ret
53
-.endm
54
-
55
-.macro SAD_SVE2_32 h
56
-    ptrue           p0.b, vl32
57
-.rept \h
58
-    ld1b            {z0.b}, p0/z, x0
59
-    ld1b            {z4.b}, p0/z, x2
60
-    add             x0, x0, x1
61
-    add             x2, x2, x3
62
-    uabalb          z16.h, z0.b, z4.b
63
-    uabalt          z16.h, z0.b, z4.b
64
-.endr
65
-    uaddv           d0, p0, z16.h
66
-    fmov            w0, s0
67
-    ret
68
-.endm
69
-
70
-.macro SAD_SVE2_64 h
71
-    cmp             x9, #48
72
-    bgt             .vl_gt_48_pixel_sad_64x\h
73
-    mov             z16.d, #0
74
-    mov             z17.d, #0
75
-    mov             z18.d, #0
76
-    mov             z19.d, #0
77
-    ptrue           p0.b, vl32
78
-.rept \h
79
-    ld1b            {z0.b}, p0/z, x0
80
-    ld1b            {z1.b}, p0/z, x0, #1, mul vl
81
-    ld1b            {z4.b}, p0/z, x2
82
-    ld1b            {z5.b}, p0/z, x2, #1, mul vl
83
-    add             x0, x0, x1
84
-    add             x2, x2, x3
85
-    uabalb          z16.h, z0.b, z4.b
86
-    uabalt          z17.h, z0.b, z4.b
87
-    uabalb          z18.h, z1.b, z5.b
88
-    uabalt          z19.h, z1.b, z5.b
89
-.endr
90
-    add             z16.h, z16.h, z17.h
91
-    add             z17.h, z18.h, z19.h
92
-    add             z16.h, z16.h, z17.h
93
-    uadalp          z24.s, p0/m, z16.h
94
-    uaddv           d5, p0, z24.s
95
-    fmov            x0, d5
96
-    ret
97
-.vl_gt_48_pixel_sad_64x\h\():
98
-    mov             z16.d, #0
99
-    mov             z17.d, #0
100
-    mov             z24.d, #0
101
-    ptrue           p0.b, vl64
102
-.rept \h
103
-    ld1b            {z0.b}, p0/z, x0
104
-    ld1b            {z4.b}, p0/z, x2
105
-    add             x0, x0, x1
106
-    add             x2, x2, x3
107
-    uabalb          z16.h, z0.b, z4.b
108
-    uabalt          z17.h, z0.b, z4.b
109
-.endr
110
-    add             z16.h, z16.h, z17.h
111
-    uadalp          z24.s, p0/m, z16.h
112
-    uaddv           d5, p0, z24.s
113
-    fmov            x0, d5
114
-    ret
115
-.endm
116
-
117
-.macro SAD_SVE2_24 h
118
-    mov             z16.d, #0
119
-    mov             x10, #24
120
-    mov             x11, #0
121
-    whilelt         p0.b, x11, x10
122
-.rept \h
123
-    ld1b            {z0.b}, p0/z, x0
124
-    ld1b            {z8.b}, p0/z, x2
125
-    add             x0, x0, x1
126
-    add             x2, x2, x3
127
-    uabalb          z16.h, z0.b, z8.b
128
-    uabalt          z16.h, z0.b, z8.b
129
-.endr
130
-    uaddv           d5, p0, z16.h
131
-    fmov            w0, s5
132
-    ret
133
-.endm
134
-
135
-.macro SAD_SVE2_48 h
136
-    cmp             x9, #48
137
-    bgt             .vl_gt_48_pixel_sad_48x\h
138
-    mov             z16.d, #0
139
-    mov             z17.d, #0
140
-    mov             z18.d, #0
141
-    mov             z19.d, #0
142
-    ptrue           p0.b, vl32
143
-    ptrue           p1.b, vl16
144
-.rept \h
145
-    ld1b            {z0.b}, p0/z, x0
146
-    ld1b            {z1.b}, p1/z, x0, #1, mul vl
147
-    ld1b            {z8.b}, p0/z, x2
148
-    ld1b            {z9.b}, p1/z, x2, #1, mul vl
149
-    add             x0, x0, x1
150
-    add             x2, x2, x3
151
-    uabalb          z16.h, z0.b, z8.b
152
-    uabalt          z17.h, z0.b, z8.b
153
-    uabalb          z18.h, z1.b, z9.b
154
-    uabalt          z19.h, z1.b, z9.b
155
-.endr
156
-    add             z16.h, z16.h, z17.h
157
-    add             z17.h, z18.h, z19.h
158
-    add             z16.h, z16.h, z17.h
159
-    uaddv           d5, p0, z16.h
160
-    fmov            w0, s5
161
-    ret
162
-.vl_gt_48_pixel_sad_48x\h\():
163
-    mov             z16.d, #0
164
-    mov             z17.d, #0
165
-    mov             x10, #48
166
-    mov             x11, #0
167
-    whilelt         p0.b, x11, x10
168
-.rept \h
169
-    ld1b            {z0.b}, p0/z, x0
170
-    ld1b            {z8.b}, p0/z, x2
171
-    add             x0, x0, x1
172
-    add             x2, x2, x3
173
-    uabalb          z16.h, z0.b, z8.b
174
-    uabalt          z17.h, z0.b, z8.b
175
-.endr
176
-    add             z16.h, z16.h, z17.h
177
-    uaddv           d5, p0, z16.h
178
-    fmov            w0, s5
179
-    ret
180
-.endm
181
-
182
-// Fully unrolled.
183
-.macro SAD_FUNC_SVE2 w, h
184
-function PFX(pixel_sad_\w\()x\h\()_sve2)
185
-    rdvl            x9, #1
186
-    cmp             x9, #16
187
-    bgt             .vl_gt_16_pixel_sad_\w\()x\h
188
-    SAD_START_\w uabdl
189
-    SAD_\w \h
190
-.if \w > 4
191
-    add             v16.8h, v16.8h, v17.8h
192
-.endif
193
-    uaddlv          s0, v16.8h
194
-    fmov            w0, s0
195
-    ret
196
-.vl_gt_16_pixel_sad_\w\()x\h\():
197
-.if \w == 4 || \w == 8 || \w == 12
198
-    SAD_START_\w uabdl
199
-    SAD_\w \h
200
-.if \w > 4
201
x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve.S Deleted
80
 
1
@@ -1,78 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2022-2023 MulticoreWare, Inc
4
- *
5
- * Authors: David Chen <david.chen@myais.com.cn>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#include "asm-sve.S"
26
-
27
-.arch armv8-a+sve
28
-
29
-#ifdef __APPLE__
30
-.section __RODATA,__rodata
31
-#else
32
-.section .rodata
33
-#endif
34
-
35
-.align 4
36
-
37
-.text
38
-
39
-function PFX(pixel_sse_pp_4x4_sve)
40
-    ptrue           p0.s, vl4
41
-    ld1b            {z0.s}, p0/z, x0
42
-    ld1b            {z17.s}, p0/z, x2
43
-    add             x0, x0, x1
44
-    add             x2, x2, x3
45
-    sub             z0.s, p0/m, z0.s, z17.s
46
-    mul             z0.s, p0/m, z0.s, z0.s
47
-.rept 3
48
-    ld1b            {z16.s}, p0/z, x0
49
-    ld1b            {z17.s}, p0/z, x2
50
-    add             x0, x0, x1
51
-    add             x2, x2, x3
52
-    sub             z16.s, p0/m, z16.s, z17.s
53
-    mla             z0.s, p0/m, z16.s, z16.s
54
-.endr
55
-    uaddv           d0, p0, z0.s
56
-    fmov            w0, s0
57
-    ret
58
-endfunc
59
-
60
-function PFX(pixel_sse_pp_4x8_sve)
61
-    ptrue           p0.s, vl4
62
-    ld1b            {z0.s}, p0/z, x0
63
-    ld1b            {z17.s}, p0/z, x2
64
-    add             x0, x0, x1
65
-    add             x2, x2, x3
66
-    sub             z0.s, p0/m, z0.s, z17.s
67
-    mul             z0.s, p0/m, z0.s, z0.s
68
-.rept 7
69
-    ld1b            {z16.s}, p0/z, x0
70
-    ld1b            {z17.s}, p0/z, x2
71
-    add             x0, x0, x1
72
-    add             x2, x2, x3
73
-    sub             z16.s, p0/m, z16.s, z17.s
74
-    mla             z0.s, p0/m, z16.s, z16.s
75
-.endr
76
-    uaddv           d0, p0, z0.s
77
-    fmov            w0, s0
78
-    ret
79
-endfunc
80
x265_4.0.tar.gz/.readthedocs.yaml Added
29
 
1
@@ -0,0 +1,27 @@
2
+# Read the Docs configuration file for Sphinx projects
3
+# .readthedocs.yaml
4
+
5
+# Project Information
6
+# Required
7
+version: 2
8
+
9
+build:
10
+  os: "ubuntu-20.04"
11
+  tools:
12
+    python: "3.10"
13
+
14
+# Use a requirements file for pip dependencies
15
+python:
16
+  install:
17
+    - requirements: doc/requirements.txt
18
+    
19
+# Build documentation in the "docs/" directory with Sphinx
20
+sphinx:
21
+  builder: html
22
+  configuration: doc/reST/conf.py
23
+  fail_on_warning: false
24
+
25
+# Optionally build your docs in additional formats such as PDF and ePub
26
+# formats:
27
+#   - pdf
28
+#   - epub
29
x265_3.6.tar.gz/build/README.txt -> x265_4.0.tar.gz/build/README.txt Changed
58
 
1
@@ -94,22 +94,42 @@
2
 
3
 = Build Instructions for cross-compilation for Arm AArch64 Targets=
4
 
5
-When the target platform is based on Arm AArch64 architecture, the x265 can be
6
-built in x86 platforms. However, the CMAKE_C_COMPILER and CMAKE_CXX_COMPILER
7
-enviroment variables should be set to point to the cross compilers of the
8
-appropriate gcc. For example:
9
+Cross compilation of x265 for AArch64 targets is possible on x86 platforms by
10
+passing a toolchain file when running CMake to configure the project:
11
 
12
-1. export CMAKE_C_COMPILER=aarch64-unknown-linux-gnu-gcc
13
-2. export CMAKE_CXX_COMPILER=aarch64-unknown-linux-gnu-g++
14
+* cmake -DCMAKE_TOOLCHAIN_FILE=<path-to-toolchain-file>
15
 
16
-The default ones are aarch64-linux-gnu-gcc and aarch64-linux-gnu-g++.
17
-Then, the normal building process can be followed.
18
+Toolchain files for AArch64 cross-compilation exist in the /build directory.
19
+These specify a default cross-compiler to use; however this can be overridden
20
+by setting the CMAKE_C_COMPILER and CMAKE_CXX_COMPILER CMake variables when
21
+running CMake to configure the project. For example:
22
 
23
-Moreover, if the target platform supports SVE or SVE2 instruction set, the
24
-CROSS_COMPILE_SVE or CROSS_COMPILE_SVE2 environment variables should be set
25
-to true, respectively. For example:
26
+* cmake -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++
27
 
28
-1. export CROSS_COMPILE_SVE2=true
29
-2. export CROSS_COMPILE_SVE=true
30
+If the target platform supports Armv8.4 Neon DotProd instructions, the
31
+CROSS_COMPILE_NEON_DOTPROD CMake option should be set to ON:
32
 
33
-Then, the normal building process can be followed.
34
+* cmake -DCROSS_COMPILE_NEON_DOTPROD=ON  <other configuration options...>
35
+
36
+If the target platform supports Armv8.6 Neon I8MM instructions, the
37
+CROSS_COMPILE_NEON_I8MM CMake option should be set to ON:
38
+
39
+* cmake -DCROSS_COMPILE_NEON_I8MM=ON  <other configuration options...>
40
+
41
+If the target platform supports SVE or SVE2, CROSS_COMPILE_SVE or
42
+CROSS_COMPILE_SVE2 CMake options should be set to ON, respectively.
43
+For example, when running CMake to configure the project:
44
+
45
+1. cmake -DCROSS_COMPILE_SVE=ON  <other configuration options...>
46
+2. cmake -DCROSS_COMPILE_SVE2=ON <other configuration options...>
47
+
48
+Note: when the CROSS_COMPILE_SVE option is set to ON the build configuration will
49
+also compile for Neon DotProd and I8MM, as we impose the constraint that SVE implies
50
+both Neon DotProd and I8MM.
51
+
52
+Similarly when the CROSS_COMPILE_SVE2 option is set to ON the build configuration
53
+will also compile for Neon I8MM, as we impose the constraint that SVE2 implies Neon
54
+I8MM. SVE2 already implies that Neon DotProd is implemented since SVE2 is an Armv9.0
55
+feature which implies Armv8.5, and Neon DotProd is mandatory from Armv8.4.
56
+
57
+Then, the normal build process can be followed.
58
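
To illustrate the workflow described by the updated README above, a possible cross-compile sequence from an x86 host might look like the following; the build directory layout and the choice of CROSS_COMPILE_SVE2 are examples only:

  # configure with the AArch64 GCC toolchain file shipped in build/aarch64-linux/
  cmake -DCMAKE_TOOLCHAIN_FILE=../build/aarch64-linux/crosscompile.cmake \
        -DCROSS_COMPILE_SVE2=ON -DENABLE_TESTS=OFF ../source
  make -j"$(nproc)"
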
x265_3.6.tar.gz/build/aarch64-darwin/crosscompile.cmake -> x265_4.0.tar.gz/build/aarch64-darwin/crosscompile.cmake Changed
26
 
1
@@ -7,17 +7,14 @@
2
 set(CMAKE_SYSTEM_NAME Darwin)
3
 set(CMAKE_SYSTEM_PROCESSOR aarch64)
4
 
5
-# specify the cross compiler
6
-set(CMAKE_C_COMPILER gcc-12)
7
-set(CMAKE_CXX_COMPILER g++-12)
8
+# specify the cross compiler (giving precedence to user-supplied CC/CXX)
9
+if(NOT DEFINED CMAKE_C_COMPILER)
10
+    set(CMAKE_C_COMPILER gcc)
11
+endif()
12
+if(NOT DEFINED CMAKE_CXX_COMPILER)
13
+    set(CMAKE_CXX_COMPILER g++)
14
+endif()
15
 
16
 # specify the target environment
17
 SET(CMAKE_FIND_ROOT_PATH  /opt/homebrew/bin/)
18
 
19
-# specify whether SVE/SVE2 is supported by the target platform
20
-if(DEFINED ENV{CROSS_COMPILE_SVE2})
21
-    set(CROSS_COMPILE_SVE2 1)
22
-elseif(DEFINED ENV{CROSS_COMPILE_SVE})
23
-    set(CROSS_COMPILE_SVE 1)
24
-endif()
25
-
26
x265_4.0.tar.gz/build/aarch64-linux-clang Added
2
 
1
+(directory)
2
x265_4.0.tar.gz/build/aarch64-linux-clang/crosscompile.cmake Added
27
 
1
@@ -0,0 +1,25 @@
2
+# CMake toolchain file for cross compiling x265 for AArch64, using Clang.
3
+
4
+set(CROSS_COMPILE_ARM64 1)
5
+set(CMAKE_SYSTEM_NAME Linux)
6
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
7
+
8
+set(TARGET_TRIPLE aarch64-linux-gnu)
9
+
10
+# specify the cross compiler (giving precedence to user-supplied CC/CXX)
11
+if(NOT DEFINED CMAKE_C_COMPILER)
12
+    set(CMAKE_C_COMPILER clang)
13
+endif()
14
+if(NOT DEFINED CMAKE_CXX_COMPILER)
15
+    set(CMAKE_CXX_COMPILER clang++)
16
+endif()
17
+
18
+# specify compiler target
19
+set(CMAKE_C_COMPILER_TARGET ${TARGET_TRIPLE})
20
+set(CMAKE_CXX_COMPILER_TARGET ${TARGET_TRIPLE})
21
+
22
+# specify assembler target
23
+list(APPEND ASM_FLAGS "--target=${TARGET_TRIPLE}")
24
+
25
+# specify the target environment
26
+SET(CMAKE_FIND_ROOT_PATH /usr/aarch64-linux-gnu)
27
x265_3.6.tar.gz/build/aarch64-linux/crosscompile.cmake -> x265_4.0.tar.gz/build/aarch64-linux/crosscompile.cmake Changed
30
 
1
@@ -7,25 +7,14 @@
2
 set(CMAKE_SYSTEM_NAME Linux)
3
 set(CMAKE_SYSTEM_PROCESSOR aarch64)
4
 
5
-# specify the cross compiler
6
-if(DEFINED ENV{CMAKE_C_COMPILER})
7
-    set(CMAKE_C_COMPILER $ENV{CMAKE_C_COMPILER})
8
-else()
9
+# specify the cross compiler (giving precedence to user-supplied CC/CXX)
10
+if(NOT DEFINED CMAKE_C_COMPILER)
11
     set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
12
 endif()
13
-if(DEFINED ENV{CMAKE_CXX_COMPILER})
14
-    set(CMAKE_CXX_COMPILER $ENV{CMAKE_CXX_COMPILER})
15
-else()
16
+if(NOT DEFINED CMAKE_CXX_COMPILER)
17
     set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
18
 endif()
19
 
20
 # specify the target environment
21
 SET(CMAKE_FIND_ROOT_PATH  /usr/aarch64-linux-gnu)
22
 
23
-# specify whether SVE/SVE2 is supported by the target platform
24
-if(DEFINED ENV{CROSS_COMPILE_SVE2})
25
-    set(CROSS_COMPILE_SVE2 1)
26
-elseif(DEFINED ENV{CROSS_COMPILE_SVE})
27
-    set(CROSS_COMPILE_SVE 1)
28
-endif()
29
-
30
x265_4.0.tar.gz/build/vc17-x86 Added
2
 
1
+(directory)
2
x265_4.0.tar.gz/build/vc17-x86/build-all.bat Added
25
 
1
@@ -0,0 +1,23 @@
2
+@echo off
3
+setlocal enabledelayedexpansion
4
+if "%VS170COMNTOOLS%" == "" (
5
+for /f "usebackq tokens=1* delims=: " %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -latest `) do (
6
+  if /i "%%i"=="productPath" (
7
+        set VS170COMNTOOLS=%%j
8
+)
9
+)
10
+)
11
+setx VS170COMNTOOLS "!VS170COMNTOOLS!"
12
+if "%VS170COMNTOOLS%" == "" (
13
+  msg "%username%" "Visual Studio 17 not detected"
14
+  exit 1
15
+)
16
+if not exist x265.sln (
17
+  call make-solutions.bat
18
+)
19
+if exist x265.sln (
20
+  call "%VS170COMNTOOLS%\..\..\tools\VsDevCmd.bat"
21
+  MSBuild /property:Configuration="Release" x265.sln
22
+  MSBuild /property:Configuration="Debug" x265.sln
23
+  MSBuild /property:Configuration="RelWithDebInfo" x265.sln
24
+)
25
x265_4.0.tar.gz/build/vc17-x86/make-solutions.bat Added
8
 
1
@@ -0,0 +1,6 @@
2
+@echo off
3
+::
4
+:: run this batch file to create a Visual Studio solution file for this project.
5
+:: See the cmake documentation for other generator targets
6
+::
7
+cmake -G "Visual Studio 17 2022" ..\..\source && cmake-gui ..\..\source
8
x265_4.0.tar.gz/build/vc17-x86_64 Added
2
 
1
+(directory)
2
x265_4.0.tar.gz/build/vc17-x86_64/build-all.bat Added
25
 
1
@@ -0,0 +1,23 @@
2
+@echo off
3
+setlocal enabledelayedexpansion
4
+if "%VS170COMNTOOLS%" == "" (
5
+for /f "usebackq tokens=1* delims=: " %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -latest `) do (
6
+  if /i "%%i"=="productPath" (
7
+        set VS170COMNTOOLS=%%j
8
+)
9
+)
10
+)
11
+setx VS170COMNTOOLS "!VS170COMNTOOLS!"
12
+if "%VS170COMNTOOLS%" == "" (
13
+  msg "%username%" "Visual Studio 17 not detected"
14
+  exit 1
15
+)
16
+if not exist x265.sln (
17
+  call make-solutions.bat
18
+)
19
+if exist x265.sln (
20
+  call "%VS170COMNTOOLS%\..\..\tools\VsDevCmd.bat"
21
+  MSBuild /property:Configuration="Release" x265.sln
22
+  MSBuild /property:Configuration="Debug" x265.sln
23
+  MSBuild /property:Configuration="RelWithDebInfo" x265.sln
24
+)
25
x265_4.0.tar.gz/build/vc17-x86_64/make-solutions.bat Added
8
 
1
@@ -0,0 +1,6 @@
2
+@echo off
3
+::
4
+:: run this batch file to create a Visual Studio solution file for this project.
5
+:: See the cmake documentation for other generator targets
6
+::
7
+cmake -G "Visual Studio 17 2022" ..\..\source && cmake-gui ..\..\source
8
x265_4.0.tar.gz/build/vc17-x86_64/multilib.bat Added
50
 
1
@@ -0,0 +1,47 @@
2
+@echo off
3
+setlocal enabledelayedexpansion
4
+if "%VS170COMNTOOLS%" == "" (
5
+for /f "usebackq tokens=1* delims=: " %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -latest `) do (
6
+  if /i "%%i"=="productPath" (
7
+        set VS170COMNTOOLS=%%j
8
+)
9
+)
10
+)
11
+setx VS170COMNTOOLS "!VS170COMNTOOLS!"
12
+call "%VS170COMNTOOLS%\..\..\tools\VsDevCmd.bat"
13
+@mkdir 12bit
14
+@mkdir 10bit
15
+@mkdir 8bit
16
+
17
+@cd 12bit
18
+cmake -G "Visual Studio 17 2022" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF -DMAIN12=ON
19
+if exist x265.sln (
20
+  MSBuild /property:Configuration="Release" x265.sln
21
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main12.lib
22
+)
23
+
24
+@cd ..\10bit
25
+cmake -G "Visual Studio 17 2022" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF
26
+if exist x265.sln (
27
+  MSBuild /property:Configuration="Release" x265.sln
28
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main10.lib
29
+)
30
+
31
+@cd ..\8bit
32
+if not exist x265-static-main10.lib (
33
+  msg "%username%" "10bit build failed"
34
+  exit 1
35
+)
36
+if not exist x265-static-main12.lib (
37
+  msg "%username%" "12bit build failed"
38
+  exit 1
39
+)
40
+cmake -G "Visual Studio 17 2022" ../../../source -DEXTRA_LIB="x265-static-main10.lib;x265-static-main12.lib" -DLINKED_10BIT=ON -DLINKED_12BIT=ON
41
+if exist x265.sln (
42
+  MSBuild /property:Configuration="Release" x265.sln
43
+  :: combine static libraries (ignore warnings caused by winxp.cpp hacks)
44
+  move Release\x265-static.lib x265-static-main.lib
45
+  LIB.EXE /ignore:4006 /ignore:4221 /OUT:Release\x265-static.lib x265-static-main.lib x265-static-main10.lib x265-static-main12.lib
46
+)
47
+
48
+pause
49
\ No newline at end of file
50
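The 8-, 10- and 12-bit static libraries built by this script are combined into a single Release\x265-static.lib. As a minimal sketch of how an application would pick one of the linked bit depths at run time through the public x265.h API (the param_alloc/param_free members of x265_api are taken from x265.h; treat their presence as an assumption of this sketch)::

    #include <cstdio>
    #include "x265.h"

    int main()
    {
        // Ask for the 10-bit encoder entry points; the combined multilib build
        // forwards the request to the linked main10 library. Passing 0 selects
        // the native bit depth of the 8-bit front end.
        const x265_api *api = x265_api_get(10);
        if (!api)
        {
            std::fprintf(stderr, "10-bit libx265 is not part of this build\n");
            return 1;
        }
        x265_param *param = api->param_alloc();  // use the selected library's allocator
        api->param_free(param);
        return 0;
    }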
x265_3.6.tar.gz/doc/reST/api.rst -> x265_4.0.tar.gz/doc/reST/api.rst Changed
31
 
1
@@ -419,21 +419,21 @@
2
    void x265_cleanup(void);
3
 
4
 VMAF (Video Multi-Method Assessment Fusion)
5
-==========================================
6
+===========================================
7
 
8
 If you set the ENABLE_LIBVMAF cmake option to ON, then x265 will report per frame
9
 and aggregate VMAF score for the given input and dump the scores in csv file.
10
-The user also need to specify the :option:`--recon` in command line to get the VMAF scores.
11
+The user also needs to specify :option:`--recon` on the command line to get the VMAF scores.::
12
  
13
     /* x265_calculate_vmafScore:
14
-     *    returns VMAF score for the input video.
15
-     *    This api must be called only after encoding was done. */
16
-    double x265_calculate_vmafscore(x265_param*, x265_vmaf_data*);
17
+    *       returns VMAF score for the input video.
18
+    *       This API must be called only after encoding was done. */
19
+   double x265_calculate_vmafscore(x265_param*, x265_vmaf_data*);
20
 
21
     /* x265_calculate_vmaf_framelevelscore:
22
-     *    returns VMAF score for each frame in a given input video. The frame level VMAF score does not include temporal scores. */
23
-    double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata*);
24
-    
25
+    *       returns VMAF score for each frame in a given input video. The frame level VMAF score does not include temporal scores. */
26
+   double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata*);
27
+
28
 .. Note::
29
 
30
     When setting ENABLE_LIBVMAF cmake option to ON, it is recommended to
31
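Both functions are only valid once the encode has finished; a minimal sketch of the call, assuming the caller has populated an x265_vmaf_data object as required by an ENABLE_LIBVMAF=ON build::

    #include "x265.h"

    // Must be called only after encoding is complete (see the note above).
    double report_vmaf(x265_param *param, x265_vmaf_data *vmafData)
    {
        return x265_calculate_vmafscore(param, vmafData);
    }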
x265_3.6.tar.gz/doc/reST/cli.rst -> x265_4.0.tar.gz/doc/reST/cli.rst Changed
201
 
1
@@ -822,7 +822,7 @@
2
    metrics from the 4 sub-CUs. When multiple inter modes like :option:`--rect`
3
    and/or :option:`--amp` are enabled, this feature will use motion cost 
4
    heuristics from the 4 sub-CUs to bypass modes that are unlikely to be the 
5
-   best choice. This can significantly improve performance when :option:`rect`
6
+   best choice. This can significantly improve performance when :option:`--rect`
7
    and/or :option:`--amp` are enabled at minimal compression efficiency loss.
8
 
9
 .. option:: --rect, --no-rect
10
@@ -983,7 +983,7 @@
11
     Store/normalize ctu distortion in analysis-save/load.
12
     0 - Disabled.
13
     1 - Save ctu distortion to the analysis file specified during :option:`--analysis-save`.
14
-        Load CTU distortion from the analysis file and normalize it across every frame during :option:`--analysis-load`.
15
+    - Load CTU distortion from the analysis file and normalize it across every frame during :option:`--analysis-load`.
16
     Default 0.
17
 
18
 .. option:: --scale-factor
19
@@ -1056,27 +1056,13 @@
20
 
21
 .. option:: --rdoq-level <0|1|2>, --no-rdoq-level
22
 
23
-   Specify the amount of rate-distortion analysis to use within
24
-   quantization::
25
+   Specify the amount of rate-distortion analysis to use within quantization::
26
 
27
-   At level 0 rate-distortion cost is not considered in quant
28
-   
29
-   At level 1 rate-distortion cost is used to find optimal rounding
30
-   values for each level (and allows psy-rdoq to be effective). It
31
-   trades-off the signaling cost of the coefficient vs its post-inverse
32
-   quant distortion from the pre-quant coefficient. When
33
-   :option:`--psy-rdoq` is enabled, this formula is biased in favor of
34
-   more energy in the residual (larger coefficient absolute levels)
35
-   
36
-   At level 2 rate-distortion cost is used to make decimate decisions
37
-   on each 4x4 coding group, including the cost of signaling the group
38
-   within the group bitmap. If the total distortion of not signaling
39
-   the entire coding group is less than the rate cost, the block is
40
-   decimated. Next, it applies rate-distortion cost analysis to the
41
-   last non-zero coefficient, which can result in many (or all) of the
42
-   coding groups being decimated. Psy-rdoq is less effective at
43
-   preserving energy when RDOQ is at level 2, since it only has
44
-   influence over the level distortion costs.
45
+           At level 0 rate-distortion cost is not considered in quant.
46
+
47
+           At level 1 rate-distortion cost is used to find optimal rounding values for each level (and allows psy-rdoq to be effective). It trades-off the signaling cost of the coefficient vs its post-inverse quant distortion from the pre-quant coefficient. When :option:`--psy-rdoq` is enabled, this formula is biased in favor of more energy in the residual (larger coefficient absolute levels).
48
+
49
+           At level 2 rate-distortion cost is used to make decimate decisions on each 4x4 coding group, including the cost of signaling the group within the group bitmap. If the total distortion of not signaling the entire coding group is less than the rate cost, the block is decimated. Next, it applies rate-distortion cost analysis to the last non-zero coefficient, which can result in many (or all) of the coding groups being decimated. Psy-rdoq is less effective at preserving energy when RDOQ is at level 2, since it only has influence over the level distortion costs.
50
 
51
 .. option:: --tu-intra-depth <1..4>
52
 
53
@@ -1221,19 +1207,16 @@
54
 
55
 .. option:: --me <integer|string>
56
 
57
-   Motion search method. Generally, the higher the number the harder
58
-   the ME method will try to find an optimal match. Diamond search is
59
-   the simplest. Hexagon search is a little better. Uneven
60
-   Multi-Hexagon is an adaption of the search method used by x264 for
61
-   slower presets. Star is a three-step search adapted from the HM
62
-   encoder: a star-pattern search followed by an optional radix scan
63
-   followed by an optional star-search refinement. Full is an
64
-   exhaustive search; an order of magnitude slower than all other
65
-   searches but not much better than umh or star. SEA is similar to
66
-   x264's ESA implementation and a speed optimization of full search.
67
-    It is a three-step motion search where the DC calculation is
68
-    followed by ADS calculation followed by SAD of the passed motion
69
-    vector candidates.
70
+   Motion search method. Generally, the higher the number the harder the ME method
71
+   will try to find an optimal match. Diamond search is the simplest. Hexagon search
72
+   is a little better. Uneven Multi-Hexagon is an adaption of the search method used
73
+   by x264 for slower presets. Star is a three-step search adapted from the HM encoder: a
74
+   star-pattern search followed by an optional radix scan followed by an optional
75
+   star-search refinement. Full is an exhaustive search; an order of magnitude slower
76
+   than all other searches but not much better than umh or star. SEA is similar to x264's
77
+   ESA implementation and a speed optimization of full search. It is a three-step motion
78
+   search where the DC calculation is followed by ADS calculation followed by SAD of the
79
+   passed motion vector candidates.
80
 
81
    0. dia
82
    1. hex **(default)**
83
@@ -1331,7 +1314,14 @@
84
    
85
 .. option:: --mcstf, --no-mcstf
86
 
87
-    Enable Motion Compensated Temporal filtering.
88
+   Motion-compensated spatio-temporal filtering (MCSTF) improves the compression
89
+   efficiency of videos that contain a high level of noise. It introduces a
90
+   temporal filter before encoding and this filter is applied only to the I- and P-frames.
91
+   It utilizes previously generated motion vectors across different video content
92
+   resolutions to find the best temporal correspondence for low-pass filtering. Here,
93
+   motion estimation is applied between the central picture and each future or past
94
+   picture, thereby generating multiple motion-compensated predictions, which are then
95
+   combined by using adaptive filtering to produce a final noise-reduced picture.
96
    Default: disabled
97
 
98
 Spatial/intra options
99
@@ -1486,7 +1476,7 @@
100
    whereas for the :option:`--scenecut`, inserts RADL at every scenecut.
101
    Recommended value is 2-3. Default 0 (disabled).
102
    
103
-   **Range of values: Between 0 and `--bframes`
104
+   **Range of values:** Between 0 and `--bframes`
105
 
106
 .. option:: --ctu-info <0, 1, 2, 4, 6>
107
 
108
@@ -1550,9 +1540,7 @@
109
    as *lslices*
110
 
111
    **Values:** 0 - disabled. 1 is the same as 0. Max 16.
112
-   Default: 8 for ultrafast, superfast, faster, fast, medium
113
-            4 for slow, slower
114
-            disabled for veryslow, slower
115
+   Default: 8 for ultrafast, superfast, faster, fast, medium; 4 for slow, slower; disabled for veryslow, slower.
116
 
117
 .. option:: --lookahead-threads <integer>
118
 
119
@@ -1602,14 +1590,17 @@
120
 
121
    Values:
122
    0 - flush the encoder only when all the input pictures are over.
123
-   1 - flush all the frames even when the input is not over. 
124
-       slicetype decision may change with this option.
125
+   1 - flush all the frames even when the input is not over. Slicetype decision may change with this option.
126
    2 - flush the slicetype decided frames only.   
127
 
128
 .. option:: --fades, --no-fades
129
 
130
    Detect and handle fade-in regions. Default disabled.
131
 
132
+.. option:: --cra-nal
133
+
134
+   Force the NAL type to CRA for all frames except the first frame; works only when :option:`--keyint` is 1.
135
+
136
 Quality, rate control and rate distortion options
137
 =================================================
138
 
139
@@ -1744,9 +1735,7 @@
140
    0. disabled
141
    1. AQ enabled 
142
    2. AQ enabled with auto-variance **(default)**
143
-   3. AQ enabled with auto-variance and bias to dark scenes. This is 
144
-   recommended for 8-bit encodes or low-bitrate 10-bit encodes, to 
145
-   prevent color banding/blocking. 
146
+   3. AQ enabled with auto-variance and bias to dark scenes. This is recommended for 8-bit encodes or low-bitrate 10-bit encodes, to prevent color banding/blocking.
147
    4. AQ enabled with auto-variance and edge information.
148
 
149
 .. option:: --aq-strength <float>
150
@@ -1759,11 +1748,13 @@
151
    Default 1.0.
152
    **Range of values:** 0.0 to 3.0
153
 
154
-.. option:: --sbrc --no-sbrc
155
+.. option:: --sbrc, --no-sbrc
156
+
157
+   Enable or disable segment-based rate control. SBRC controls the overflow with
158
+   segment sizes, and it is based on the Capped CRF mode. Segment duration depends on
159
+   the keyframe interval specified. If unspecified, the default keyframe interval will
160
+   be used. Default: disabled. **Experimental Feature**
161
 
162
-   To enable and disable segment based rate control.Segment duration depends on the
163
-   keyframe interval specified.If unspecified,default keyframe interval will be used.
164
-   Default: disabled.
165
 
166
 .. option:: --hevc-aq
167
 
168
@@ -1849,7 +1840,7 @@
169
    and also redundant steps are skipped.
170
    In pass 1 analysis information like motion vector, depth, reference and prediction
171
    modes of the final best CTU partition is stored for each CTU.
172
-   Multipass analysis refinement cannot be enabled when :option:`--analysis-save`/:option:`analysis-load`
173
+   Multipass analysis refinement cannot be enabled when :option:`--analysis-save`/:option:`--analysis-load`
174
    is enabled and both will be disabled when enabled together. This feature requires :option:`--pmode`/:option:`--pme`
175
    to be disabled and hence pmode/pme will be disabled when enabled at the same time.
176
 
177
@@ -2014,26 +2005,29 @@
178
    When :option:`--scenecut-aware-qp` is:
179
 
180
    * 1 (Forward masking):
181
-   --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta>
182
-   or 
183
-   --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,
184
-                       fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,
185
-                       fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6>
186
+
187
+           --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta>
188
+
189
+           or
190
+
191
+           --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6>
192
+
193
    * 2 (Backward masking):
194
-   --masking-strength <bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
195
-   or 
196
-   --masking-strength <bwdWindow1,bwdRefQPDelta1,bwdNonRefQPDelta1,bwdWindow2,bwdRefQPDelta2,bwdNonRefQPDelta2,
197
-                       bwdWindow3,bwdRefQPDelta3,bwdNonRefQPDelta3,bwdWindow4,bwdRefQPDelta4,bwdNonRefQPDelta4,
198
-                       bwdWindow5,bwdRefQPDelta5,bwdNonRefQPDelta5,bwdWindow6,bwdRefQPDelta6,bwdNonRefQPDelta6>
199
+
200
+           --masking-strength <bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
201
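Most switches documented in this file can also be set through the library API with x265_param_parse(); a minimal sketch covering a few of the options touched by this update (the option names are the CLI switches with the leading "--" dropped; the exact spellings accepted by the parser are an assumption of this sketch)::

    #include <cstdio>
    #include "x265.h"

    int configure(x265_param *param)
    {
        int err = 0;
        err |= x265_param_parse(param, "rdoq-level", "2");  // full RDOQ incl. coding-group decimation
        err |= x265_param_parse(param, "me", "star");       // HM-style three-step star search
        err |= x265_param_parse(param, "mcstf", "1");       // temporal filter on I- and P-frames
        if (err)
            std::fprintf(stderr, "an option was rejected by this build\n");
        return err;
    }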
x265_3.6.tar.gz/doc/reST/conf.py -> x265_4.0.tar.gz/doc/reST/conf.py Changed
10
 
1
@@ -14,7 +14,7 @@
2
 copyright = u'2014 MulticoreWare Inc'
3
 
4
 # -- Options for HTML output ---------------------------------------------------
5
-html_theme = "default"
6
+html_theme = "sphinx_rtd_theme"
7
 
8
 # One entry per manual page. List of tuples
9
 # (source start file, name, description, authors, manual section).
10
x265_3.6.tar.gz/doc/reST/presets.rst -> x265_4.0.tar.gz/doc/reST/presets.rst Changed
38
 
1
@@ -21,16 +21,17 @@
2
 The presets adjust encoder parameters as shown in the following table.
3
 Any parameters below that are specified in your command-line will be 
4
 changed from the value specified by the preset.
5
-   0. ultrafast
6
-   1. superfast
7
-   2. veryfast
8
-   3. faster
9
-   4. fast
10
-   5. medium **(default)**
11
-   6. slow
12
-   7. slower
13
-   8. veryslow
14
-   9. placebo
15
+
16
+    0. ultrafast
17
+    1. superfast
18
+    2. veryfast
19
+    3. faster
20
+    4. fast
21
+    5. medium **(default)**
22
+    6. slow
23
+    7. slower
24
+    8. veryslow
25
+    9. placebo
26
 
27
 +-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
28
 | preset          |  0  |  1  |  2  |   3 |   4 |   5 |   6  |   7  |   8  |  9   |
29
@@ -152,7 +153,7 @@
30
     * :option:`--sao` 0
31
     * :option:`--psy-rd` 4.0
32
     * :option:`--psy-rdoq` 10.0
33
-    * :option:`--recursion-skip` 0
34
+    * :option:`--rskip` 0
35
     
36
 It also enables a specialised ratecontrol algorithm :option:`--rc-grain` 
37
 that strictly minimises QP fluctuations across frames, while still allowing 
38
x265_3.6.tar.gz/doc/reST/releasenotes.rst -> x265_4.0.tar.gz/doc/reST/releasenotes.rst Changed
116
 
1
@@ -2,6 +2,44 @@
2
 Release Notes
3
 *************
4
 
5
+Version 4.0
6
+===========
7
+
8
+Release date - 13th September, 2024.
9
+
10
+New feature
11
+-----------
12
+1. Alpha Channel feature.
13
+2. Screen Content Coding (SCC).
14
+3. MV-HEVC feature.
15
+
16
+Enhancements to existing features
17
+---------------------------------
18
+1. Added support for the VMAF v3.x.
19
+
20
+API changes
21
+-----------
22
+1. Add command line parameter for Alpha Channel feature :option:`--alpha`.
23
+2. Add command line parameter for SCC feature :option:`--scc 1`.
24
+3. Add command line parameters for the MV-HEVC feature :option:`--multiview-config "multiview_config.txt"`.
25
+
26
+Optimizations
27
+---------------------
28
+1. Arm SIMD optimizations: Several time-consuming scalar C functions now have SIMD implementations on Arm platforms. Existing Arm SIMD implementations have also been optimized. These optimizations result in up to 57% faster encoding compared to release 3.6.
29
+2. Arm SIMD optimizations include use of Armv8.4 DotProd, Armv8.6 I8MM, and Armv9 SVE2 instruction set extensions. The following algorithms now have optimized SIMD implementations: SAD, SSE, DCT, SAO, convolution, quantization, intra_planar, intraFilter, intrapred DC and IDCT16x16.
30
+
31
+Bug fixes
32
+---------
33
+1. Fix for broken y4m pipe input.
34
+2. Fix SCC crash on multipass encode.
35
+3. Fix mcstf when :option:`--bframes` value was less than 5.
36
+4. Fix lowpass DCT for high bit depth.
37
+5. Added build support for Visual Studio 17.
38
+6. Fix issue in default code flow and memory leak.
39
+7. Framethreads tuning for Windows ARM devices.
40
+8. Fix scc crash on multipass encode.
41
+
42
+
43
 Version 3.6
44
 ===========
45
 
46
@@ -9,44 +47,44 @@
47
 
48
 New feature
49
 -----------
50
-1. Segment based Ratecontrol (SBRC) feature
51
-2. Motion-Compensated Spatio-Temporal Filtering
52
-3. Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware Quantization)
53
-4. Histogram-Based Scene Change Detection
54
-5. Film-Grain characteristics as a SEI message to support Film Grain Synthesis(FGS)
55
-6. Add temporal layer implementation(Hierarchical B-frame implementation)
56
- 
57
+1. Segment based Ratecontrol (SBRC) feature.
58
+2. Motion-Compensated Spatio-Temporal Filtering.
59
+3. Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware Quantization).
60
+4. Histogram-Based Scene Change Detection.
61
+5. Film-Grain characteristics as a SEI message to support Film Grain Synthesis (FGS).
62
+6. Add temporal layer implementation (Hierarchical B-frame implementation).
63
+
64
 Enhancements to existing features
65
 ---------------------------------
66
-1. Added Dolby Vision 8.4 Profile Support
67
+1. Added Dolby Vision 8.4 Profile Support.
68
 
69
 
70
 API changes
71
 -----------
72
-1. Add Segment based Ratecontrol(SBRC) feature: "--no-sbrc".
73
-2. Add command line parameter for mcstf feature: "--no-mctf".
74
-3. Add command line parameters for the scene cut aware qp feature: "--scenecut-aware-qp" and "--masking-strength".
75
-4. Add command line parameters for Histogram-Based Scene Change Detection: "--hist-scenecut".
76
-5. Add film grain characteristics as a SEI message to the bitstream: "--film-grain <filename>"
77
-6. cli: add new option --cra-nal (Force nal type to CRA to all frames expect for the first frame, works only with keyint 1)
78
+1. Add command line parameter for SBRC feature :option:`--sbrc`.
79
+2. Add command line parameter for mcstf feature :option:`--mcstf`.
80
+3. Add command line parameters for the scene cut aware qp feature :option:`--scenecut-aware-qp` and :option:`--masking-strength`.
81
+4. Add command line parameters for Histogram-Based Scene Change Detection :option:`--hist-scenecut`.
82
+5. Add command line parameters for film grain characteristics as a SEI message to the bitstream :option:`--film-grain`.
83
+6. cli: add new option :option:`--cra-nal` (force the NAL type to CRA for all frames except the first frame; works only when :option:`--keyint` is 1).
84
 
85
 Optimizations
86
 ---------------------
87
-ARM64 NEON optimizations:- Several time-consuming C functions have been optimized for the targeted platform - aarch64. The overall performance increased by around 20%.
88
-SVE/SVE2 optimizations
89
+1. ARM64 NEON optimizations: Several time-consuming C functions have been optimized for the target platform, aarch64. The overall performance increased by around 20%.
90
+2. SVE/SVE2 optimizations.
91
 
92
 
93
 Bug fixes
94
 ---------
95
-1. Linux bug to utilize all the cores
96
-2. Crash with hist-scenecut build when source resolution is not multiple of minCuSize
97
-3. 32bit and 64bit builds generation for ARM
98
-4. bugs in zonefile feature (Reflect Zonefile Parameters inside Lookahead, extra IDR issue, Avg I Slice QP value issue etc..)
99
-5. Add x86 ASM implementation for subsampling luma 
100
-6. Fix for abrladder segfault with load reuse level 1 
101
-7. Reorder miniGOP based on temporal layer hierarchy and add support for more B frame 
102
-8. Add MacOS aarch64 build support 
103
-9. Fix boundary condition issue for Gaussian filter
104
+1. Linux bug to utilize all the cores.
105
+2. Crash with hist-scenecut build when source resolution is not multiple of minCuSize.
106
+3. 32bit and 64bit builds generation for ARM.
107
+4. bugs in zonefile feature (Reflect Zonefile Parameters inside Lookahead, extra IDR issue, Avg I Slice QP value issue etc.).
108
+5. Add x86 ASM implementation for subsampling luma.
109
+6. Fix for abrladder segfault with load reuse level 1.
110
+7. Reorder miniGOP based on temporal layer hierarchy and add support for more B frames. 
111
+8. Add MacOS aarch64 build support.
112
+9. Fix boundary condition issue for Gaussian filter.
113
 
114
 
115
 Version 3.5
116
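The new 4.0 features are driven by the CLI parameters listed under "API changes"; a hedged sketch of switching them on through x265_param_parse(), assuming a build configured with -DENABLE_ALPHA=ON, -DENABLE_MULTIVIEW=ON and -DENABLE_SCC_EXT=ON (the option spellings mirror the CLI switches and are an assumption of this sketch)::

    #include "x265.h"

    int enable_v40_features(x265_param *param)
    {
        int err = 0;
        err |= x265_param_parse(param, "alpha", "1");
        err |= x265_param_parse(param, "scc", "1");
        err |= x265_param_parse(param, "multiview-config", "multiview_config.txt");
        return err;  // non-zero if any option is unknown to this build
    }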
x265_3.6.tar.gz/doc/reST/svthevc.rst -> x265_4.0.tar.gz/doc/reST/svthevc.rst Changed
19
 
1
@@ -3,7 +3,7 @@
2
 
3
 .. _SvtHevc:
4
 
5
-x265 has support for open source HEVC encoder `SVT-HEVC <https://01.org/svt>`_ 
6
+x265 has support for open source HEVC encoder `SVT-HEVC <https://www.intel.com/content/www/us/en/developer/articles/technical/scalable-video-technology.html>`_
7
 and can generate SVT-HEVC compliant bitstreams. SVT-HEVC encoder can be enabled at run time 
8
 using :option:`--svt`. Since SVT-HEVC params/CLI are not exposed outside, it has to be 
9
 configured only via x265 CLI options. The API's of SVT-HEVC are accessed through x265's API 
10
@@ -22,7 +22,7 @@
11
 
12
 **SVT-HEVC**
13
 
14
-1. Clone `SVT-HEVC <https://github.com/intel/SVT-HEVC>`_ (say at path "/home/app/") and build it (follow the build steps in its README file)
15
+1. Clone `SVT-HEVC-repo <https://github.com/intel/SVT-HEVC>`_ (say at path "/home/app/") and build it (follow the build steps in its README file)
16
 2. Once build is successful, binaries can be found inside the *Bin* folder at its root directory ("/home/app/SVT-HEVC/Bin/Release/")
17
 
18
 **x265**
19
x265_3.6.tar.gz/doc/reST/x265.rst -> x265_4.0.tar.gz/doc/reST/x265.rst Changed
7
 
1
@@ -1,3 +1,5 @@
2
+:orphan:
3
+
4
 x265 CLI Documentation
5
 ######################
6
 
7
x265_4.0.tar.gz/doc/requirements.txt Added
5
 
1
@@ -0,0 +1,3 @@
2
+sphinx
3
+sphinx-rtd-theme
4
+# Add other dependencies here
5
x265_3.6.tar.gz/source/CMakeLists.txt -> x265_4.0.tar.gz/source/CMakeLists.txt Changed
201
 
1
@@ -22,6 +22,8 @@
2
 include(CheckFunctionExists)
3
 include(CheckSymbolExists)
4
 include(CheckCXXCompilerFlag)
5
+include(CheckCSourceCompiles)
6
+include(CheckCXXSourceCompiles)
7
 
8
 option(FPROFILE_GENERATE "Compile executable to generate usage data" OFF)
9
 option(FPROFILE_USE "Compile executable using generated usage data" OFF)
10
@@ -29,7 +31,7 @@
11
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
12
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
13
 # X265_BUILD must be incremented each time the public API is changed
14
-set(X265_BUILD 209)
15
+set(X265_BUILD 212)
16
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
17
                "${PROJECT_BINARY_DIR}/x265.def")
18
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
19
@@ -80,14 +82,16 @@
20
     set(ARM 1)
21
     add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
22
 elseif(ARM64MATCH GREATER "-1")
23
-    #if(CROSS_COMPILE_ARM64)
24
-        #message(STATUS "Cross compiling for ARM64 arch")
25
-    #else()
26
-        #set(CROSS_COMPILE_ARM64 0)
27
-    #endif()
28
     message(STATUS "Detected ARM64 target processor")
29
     set(ARM64 1)
30
-    add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON)
31
+
32
+    option(AARCH64_WARNINGS_AS_ERRORS "Build with -Werror for AArch64 Intrinsics files" OFF)
33
+
34
+    # Options for cross compiling AArch64 optional extensions
35
+    option(CROSS_COMPILE_SVE "Cross Compile for SVE Target" OFF)
36
+    option(CROSS_COMPILE_SVE2 "Cross Compile for SVE2 Target" OFF)
37
+    option(CROSS_COMPILE_NEON_DOTPROD "Cross Compile for Neon DotProd Target" OFF)
38
+    option(CROSS_COMPILE_NEON_I8MM "Cross Compile for Neon I8MM Target" OFF)
39
 else()
40
     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
41
     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
42
@@ -259,28 +263,106 @@
43
             set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
44
         endif()
45
     endif()
46
-   if(ARM64 OR CROSS_COMPILE_ARM64)
47
-        find_package(Neon)
48
-        find_package(SVE)
49
-        find_package(SVE2)
50
-        if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
51
-            message(STATUS "Found SVE2")
52
-           set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
53
-            add_definitions(-DHAVE_SVE2)
54
-            add_definitions(-DHAVE_SVE)
55
-            add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
56
-        elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
57
-            message(STATUS "Found SVE")
58
-           set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
59
-            add_definitions(-DHAVE_SVE)
60
-            add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
61
-        elseif(CPU_HAS_NEON)
62
-            message(STATUS "Found NEON")
63
-            set(ARM_ARGS -fPIC -flax-vector-conversions)
64
-            add_definitions(-DHAVE_NEON)
65
+    if(ARM64)
66
+        message(STATUS "Found Neon")
67
+        set(CPU_HAS_NEON 1)
68
+        add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON=1)
69
+
70
+        if(CROSS_COMPILE_ARM64)
71
+            # Handle cross-compilation options.
72
+            if(CROSS_COMPILE_NEON_DOTPROD)
73
+                set(CPU_HAS_NEON_DOTPROD 1)
74
+            endif()
75
+            if(CROSS_COMPILE_NEON_I8MM)
76
+                set(CPU_HAS_NEON_I8MM 1)
77
+                # Impose the constraint that Neon I8MM implies Neon DotProd.
78
+                set(CPU_HAS_NEON_DOTPROD 1)
79
+            endif()
80
+            if(CROSS_COMPILE_SVE)
81
+                set(CPU_HAS_SVE 1)
82
+                # Impose the constraint that SVE implies Neon DotProd and I8MM.
83
+                set(CPU_HAS_NEON_DOTPROD 1)
84
+                set(CPU_HAS_NEON_I8MM 1)
85
+            endif()
86
+            if(CROSS_COMPILE_SVE2)
87
+                set(CPU_HAS_SVE2 1)
88
+                # SVE2 implies SVE and Neon DotProd.
89
+                set(CPU_HAS_SVE 1)
90
+                set(CPU_HAS_NEON_DOTPROD 1)
91
+                # Impose the constraint that SVE2 implies Neon I8MM.
92
+                set(CPU_HAS_NEON_I8MM 1)
93
+            endif()
94
         else()
95
-            set(ARM_ARGS -fPIC -flax-vector-conversions)
96
-        endif()        
97
+            if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
98
+                find_package(NEON_DOTPROD)
99
+                find_package(NEON_I8MM)
100
+                find_package(SVE)
101
+                find_package(SVE2)
102
+            else()
103
+                message(STATUS "Compile time feature detection unsupported on this platform")
104
+            endif()
105
+        endif()
106
+
107
+        if(CPU_HAS_NEON_DOTPROD)
108
+            # Neon DotProd is mandatory from Armv8.4.
109
+            message(STATUS "Found Neon DotProd")
110
+            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
111
+            add_definitions(-DHAVE_NEON_DOTPROD=1)
112
+        endif()
113
+        if(CPU_HAS_NEON_I8MM)
114
+            # Neon I8MM is mandatory from Armv8.6.
115
+            message(STATUS "Found Neon I8MM")
116
+            # Impose the constraint that Neon I8MM implies Neon DotProd.
117
+            if(NOT CPU_HAS_NEON_DOTPROD)
118
+                message(FATAL_ERROR "Unsupported AArch64 feature combination (Neon I8MM without Neon DotProd)")
119
+            endif()
120
+            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)
121
+            add_definitions(-DHAVE_NEON_I8MM=1)
122
+        endif()
123
+        if(CPU_HAS_SVE)
124
+            message(STATUS "Found SVE")
125
+            # Impose the constraint that SVE implies Neon I8MM.
126
+            if(NOT CPU_HAS_NEON_I8MM)
127
+                message(FATAL_ERROR "Unsupported AArch64 feature combination (SVE without Neon I8MM)")
128
+            endif()
129
+            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)
130
+            add_definitions(-DHAVE_SVE=1)
131
+        endif()
132
+        if(CPU_HAS_SVE2)
133
+            message(STATUS "Found SVE2")
134
+            # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod
135
+            set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)
136
+            add_definitions(-DHAVE_SVE2=1)
137
+        endif()
138
+        set(ARM_ARGS ${ARM_ARGS} -fPIC)
139
+        # Do not allow implicit vector type conversions in Clang builds (this
140
+        # is already the default in GCC builds).
141
+        check_cxx_compiler_flag(-flax-vector-conversions=none CC_HAS_FLAX_VEC_CONV_NONE)
142
+        if(CC_HAS_FLAX_VEC_CONV_NONE)
143
+            set(ARM_ARGS ${ARM_ARGS} -flax-vector-conversions=none)
144
+        endif()
145
+        if(CPU_HAS_SVE)
146
+            set(SVE_HEADER_TEST "
147
+#ifndef __ARM_NEON_SVE_BRIDGE
148
+#error 1
149
+#endif
150
+#include <arm_sve.h>
151
+#include <arm_neon_sve_bridge.h>
152
+int main() { return 0; }")
153
+            set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
154
+            # CMAKE_REQUIRED_FLAGS requires a space-delimited string, whereas
155
+            # ARM_ARGS is defined and used elsewhere as a ;-list.
156
+            foreach(ARM_ARG ${ARM_ARGS})
157
+                string(APPEND CMAKE_REQUIRED_FLAGS " ${ARM_ARG}")
158
+            endforeach()
159
+            check_c_source_compiles("${SVE_HEADER_TEST}" SVE_HEADER_C_TEST_COMPILED)
160
+            check_cxx_source_compiles("${SVE_HEADER_TEST}" SVE_HEADER_CXX_TEST_COMPILED)
161
+            set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
162
+            if(SVE_HEADER_C_TEST_COMPILED AND SVE_HEADER_CXX_TEST_COMPILED)
163
+                add_definitions(-DHAVE_SVE_BRIDGE=1)
164
+                set(HAVE_SVE_BRIDGE 1)
165
+            endif()
166
+        endif()
167
     endif()
168
    if(ENABLE_PIC)
169
    list(APPEND ARM_ARGS -DPIC)
170
@@ -334,9 +416,11 @@
171
     if (CC_HAS_FAST_MATH)
172
         add_definitions(-ffast-math)
173
     endif()
174
-    check_cxx_compiler_flag(-mstackrealign CC_HAS_STACK_REALIGN) 
175
-    if (CC_HAS_STACK_REALIGN)
176
-        add_definitions(-mstackrealign)
177
+    if (NOT (ARM64 OR CROSS_COMPILE_ARM64))
178
+        check_cxx_compiler_flag(-mstackrealign CC_HAS_STACK_REALIGN)
179
+        if (CC_HAS_STACK_REALIGN)
180
+            add_definitions(-mstackrealign)
181
+        endif()
182
     endif()
183
     # Disable exceptions. Reduce executable size, increase compability.
184
     check_cxx_compiler_flag(-fno-exceptions CC_HAS_FNO_EXCEPTIONS_FLAG)
185
@@ -558,6 +642,21 @@
186
     add_definitions(-DDETAILED_CU_STATS)
187
 endif(DETAILED_CU_STATS)
188
 
189
+option(ENABLE_ALPHA "Enable alpha encoding in x265" OFF)
190
+if(ENABLE_ALPHA)
191
+    add_definitions(-DENABLE_ALPHA)
192
+endif()
193
+
194
+option(ENABLE_MULTIVIEW "Enable Multi-view encoding in HEVC" OFF)
195
+if(ENABLE_MULTIVIEW)
196
+    add_definitions(-DENABLE_MULTIVIEW)
197
+endif()
198
+
199
+option(ENABLE_SCC_EXT "Enable screen content coding extension in HEVC" OFF)
200
+if(ENABLE_SCC_EXT)
201
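The HAVE_SVE_BRIDGE definition introduced above gates code that moves data between Neon and SVE registers through the ACLE Neon-SVE bridge; as an illustrative sketch of the pattern the compile test checks for (not code from the x265 sources, and assuming the SVE -march flags set earlier in this file)::

    #include <arm_neon.h>
    #if defined(HAVE_SVE_BRIDGE)
    #include <arm_sve.h>
    #include <arm_neon_sve_bridge.h>

    // Move a 128-bit Neon vector into the low lanes of an SVE register,
    // operate on it with an SVE intrinsic, and move the result back.
    static inline int16x8_t add_one_sve(int16x8_t v)
    {
        svint16_t sv = svset_neonq_s16(svundef_s16(), v);
        sv = svadd_n_s16_x(svptrue_b16(), sv, 1);
        return svget_neonq_s16(sv);
    }
    #endif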
x265_3.6.tar.gz/source/abrEncApp.cpp -> x265_4.0.tar.gz/source/abrEncApp.cpp Changed
201
 
1
@@ -63,6 +63,7 @@
2
             m_passEnc[i]->init(ret);
3
         }
4
 
5
+        m_numInputViews = m_passEnc[0]->m_param->numViews;
6
         if (!allocBuffers())
7
         {
8
             x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for buffers\n");
9
@@ -76,7 +77,11 @@
10
 
11
     bool AbrEncoder::allocBuffers()
12
     {
13
+#if ENABLE_MULTIVIEW
14
+        m_inputPicBuffer = X265_MALLOC(x265_picture**, MAX_VIEWS);
15
+#else
16
         m_inputPicBuffer = X265_MALLOC(x265_picture**, m_numEncodes);
17
+#endif
18
         m_analysisBuffer = X265_MALLOC(x265_analysis_data*, m_numEncodes);
19
 
20
         m_picWriteCnt = new ThreadSafeInteger[m_numEncodes];
21
@@ -89,21 +94,48 @@
22
         m_analysisRead = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
23
         m_readFlag = X265_MALLOC(int*, m_numEncodes);
24
 
25
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
26
+#if ENABLE_MULTIVIEW
27
+        if (m_passEnc[0]->m_param->numViews > 1)
28
         {
29
-            m_inputPicBuffer[pass] = X265_MALLOC(x265_picture*, m_queueSize);
30
-            for (uint32_t idx = 0; idx < m_queueSize; idx++)
31
+            for (uint8_t pass = 0; pass < m_numInputViews; pass++)
32
             {
33
-                m_inputPicBuffer[pass][idx] = x265_picture_alloc();
34
-                x265_picture_init(m_passEnc[pass]->m_param, m_inputPicBuffer[pass][idx]);
35
+                m_inputPicBuffer[pass] = X265_MALLOC(x265_picture*, m_queueSize);
36
+                for (uint32_t idx = 0; idx < m_queueSize; idx++)
37
+                {
38
+                    m_inputPicBuffer[pass][idx] = x265_picture_alloc();
39
+                    x265_picture_init(m_passEnc[0]->m_param, m_inputPicBuffer[pass][idx]);
40
+                }
41
+                if (pass == 0)
42
+                {
43
+                    CHECKED_MALLOC_ZERO(m_analysisBuffer[pass], x265_analysis_data, m_queueSize);
44
+                    m_picIdxReadCnt[pass] = new ThreadSafeInteger[m_queueSize];
45
+                    m_analysisWrite[pass] = new ThreadSafeInteger[m_queueSize];
46
+                    m_analysisRead[pass] = new ThreadSafeInteger[m_queueSize];
47
+                    m_readFlag[pass] = X265_MALLOC(int, m_queueSize);
48
+                }
49
             }
50
+        }
51
+        else
52
+        {
53
+#endif
54
+            for (uint8_t pass = 0; pass < m_numEncodes; pass++)
55
+            {
56
+                m_inputPicBuffer[pass] = X265_MALLOC(x265_picture*, m_queueSize);
57
+                for (uint32_t idx = 0; idx < m_queueSize; idx++)
58
+                {
59
+                    m_inputPicBuffer[pass][idx] = x265_picture_alloc();
60
+                    x265_picture_init(m_passEnc[pass]->m_param, m_inputPicBuffer[pass][idx]);
61
+                }
62
 
63
-            CHECKED_MALLOC_ZERO(m_analysisBuffer[pass], x265_analysis_data, m_queueSize);
64
-            m_picIdxReadCnt[pass] = new ThreadSafeInteger[m_queueSize];
65
-            m_analysisWrite[pass] = new ThreadSafeInteger[m_queueSize];
66
-            m_analysisRead[pass] = new ThreadSafeInteger[m_queueSize];
67
-            m_readFlag[pass] = X265_MALLOC(int, m_queueSize);
68
+                CHECKED_MALLOC_ZERO(m_analysisBuffer[pass], x265_analysis_data, m_queueSize);
69
+                m_picIdxReadCnt[pass] = new ThreadSafeInteger[m_queueSize];
70
+                m_analysisWrite[pass] = new ThreadSafeInteger[m_queueSize];
71
+                m_analysisRead[pass] = new ThreadSafeInteger[m_queueSize];
72
+                m_readFlag[pass] = X265_MALLOC(int, m_queueSize);
73
+            }
74
+#if ENABLE_MULTIVIEW
75
         }
76
+#endif
77
         return true;
78
     fail:
79
         return false;
80
@@ -112,15 +144,37 @@
81
     void AbrEncoder::destroy()
82
     {
83
         x265_cleanup(); /* Free library singletons */
84
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
85
+#if ENABLE_MULTIVIEW
86
+        for (uint8_t pass = 0; pass < m_numInputViews; pass++)
87
         {
88
             for (uint32_t index = 0; index < m_queueSize; index++)
89
             {
90
                 X265_FREE(m_inputPicBuffer[pass][index]->planes[0]);
91
                 x265_picture_free(m_inputPicBuffer[pass][index]);
92
             }
93
+            X265_FREE(m_inputPicBuffer[pass]);
94
 
95
+            if (pass == 0)
96
+            {
97
+                X265_FREE(m_analysisBuffer[pass]);
98
+                X265_FREE(m_readFlag[pass]);
99
+                delete[] m_picIdxReadCnt[pass];
100
+                delete[] m_analysisWrite[pass];
101
+                delete[] m_analysisRead[pass];
102
+                m_passEnc[pass]->destroy();
103
+                delete m_passEnc[pass];
104
+            }
105
+        }
106
+#else
107
+        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
108
+        {
109
+            for (uint32_t index = 0; index < m_queueSize; index++)
110
+            {
111
+                X265_FREE(m_inputPicBuffer[pass][index]->planes[0]);
112
+                x265_picture_free(m_inputPicBuffer[pass][index]);
113
+            }
114
             X265_FREE(m_inputPicBuffer[pass]);
115
+
116
             X265_FREE(m_analysisBuffer[pass]);
117
             X265_FREE(m_readFlag[pass]);
118
             delete[] m_picIdxReadCnt[pass];
119
@@ -129,6 +183,7 @@
120
             m_passEnc[pass]->destroy();
121
             delete m_passEnc[pass];
122
         }
123
+#endif
124
         X265_FREE(m_inputPicBuffer);
125
         X265_FREE(m_analysisBuffer);
126
         X265_FREE(m_readFlag);
127
@@ -150,8 +205,11 @@
128
         m_id = id;
129
         m_cliopt = cliopt;
130
         m_parent = parent;
131
-        if(!(m_cliopt.enableScaler && m_id))
132
-            m_input = m_cliopt.input;
133
+        if (!(m_cliopt.enableScaler && m_id))
134
+        {
135
+            for (int view = 0; view < m_cliopt.param->numViews; view++)
136
+                m_input[view] = m_cliopt.input[view];
137
+        }
138
         m_param = cliopt.param;
139
         m_inputOver = false;
140
         m_lastIdx = -1;
141
@@ -206,6 +264,7 @@
142
         {
143
             x265_log(NULL, X265_LOG_ERROR, "x265_encoder_open() failed for Enc, \n");
144
             m_ret = 2;
145
+            m_reader = NULL;
146
             return -1;
147
         }
148
 
149
@@ -402,7 +461,7 @@
150
     }
151
 
152
 
153
-    bool PassEncoder::readPicture(x265_picture *dstPic)
154
+    bool PassEncoder::readPicture(x265_picture* dstPic, int view)
155
     {
156
         /*Check and wait if there any input frames to read*/
157
         int ipread = m_parent->m_picReadCntm_id.get();
158
@@ -480,7 +539,7 @@
159
             }
160
 
161
 
162
-            x265_picture *srcPic = (x265_picture*)(m_parent->m_inputPicBuffer[m_id][readPos]);
163
+            x265_picture* srcPic = (m_param->numViews > 1) ? (x265_picture*)(m_parent->m_inputPicBuffer[view][readPos]) : (x265_picture*)(m_parent->m_inputPicBuffer[m_id][readPos]);
164
 
165
             x265_picture *pic = (x265_picture*)(dstPic);
166
             pic->colorSpace = srcPic->colorSpace;
167
@@ -499,6 +558,8 @@
168
             pic->planes[0] = srcPic->planes[0];
169
             pic->planes[1] = srcPic->planes[1];
170
             pic->planes[2] = srcPic->planes[2];
171
+            pic->planes[3] = srcPic->planes[3];
172
+            pic->format = srcPic->format;
173
             if (isAbrLoad)
174
                 pic->analysisData = *analysisData;
175
             return true;
176
@@ -529,11 +590,17 @@
177
                 x265_log(m_param, X265_LOG_ERROR, "Unable to register CTRL+C handler: %s in %s\n",
178
                     strerror(errno), profileName);
179
 
180
-            x265_picture pic_orig, pic_out;
181
-            x265_picture *pic_in = &pic_orig;
182
+            x265_picture pic_orig[MAX_VIEWS];
183
+            x265_picture *pic_in[MAX_VIEWS];
184
+            for (int view = 0; view < m_param->numViews; view++)
185
+                pic_in[view] = &pic_orig[view];
186
             /* Allocate recon picture if analysis save/load is enabled */
187
             std::priority_queue<int64_t>* pts_queue = m_cliopt.output->needPTS() ? new std::priority_queue<int64_t>() : NULL;
188
-            x265_picture *pic_recon = (m_cliopt.recon || m_param->analysisSave || m_param->analysisLoad || pts_queue || reconPlay || m_param->csvLogLevel) ? &pic_out : NULL;
189
+            x265_picture* pic_recon[MAX_LAYERS];
190
+            x265_picture pic_out[MAX_LAYERS];
191
+
192
+            for (int i = 0; i < m_param->numLayers; i++)
193
+                pic_recon[i] = (m_cliopt.recon[i] || m_param->analysisSave || m_param->analysisLoad || pts_queue || reconPlay || m_param->csvLogLevel) ? &pic_out[i] : NULL;
194
             uint32_t inFrameCount = 0;
195
             uint32_t outFrameCount = 0;
196
             x265_nal *p_nal;
197
@@ -544,7 +611,7 @@
198
             uint8_t *rpuPayload = NULL;
199
             int inputPicNum = 1;
200
             x265_picture picField1, picField2;
201
x265_3.6.tar.gz/source/abrEncApp.h -> x265_4.0.tar.gz/source/abrEncApp.h Changed
36
 
1
@@ -42,6 +42,7 @@
2
     {
3
     public:
4
         uint8_t           m_numEncodes;
5
+        uint8_t           m_numInputViews; // Number of inputs for multiview-extension
6
         PassEncoder        **m_passEnc;
7
         uint32_t           m_queueSize;
8
         ThreadSafeInteger  m_numActiveEncodes;
9
@@ -86,7 +87,7 @@
10
         x265_picture **m_outputRecon;
11
 
12
         CLIOptions m_cliopt;
13
-        InputFile* m_input;
14
+        InputFile* m_input[MAX_VIEWS];
15
         const char* m_reconPlayCmd;
16
         FILE*    m_qpfile;
17
         FILE*    m_zoneFile;
18
@@ -102,7 +103,7 @@
19
         void startThreads();
20
         void copyInfo(x265_analysis_data *src);
21
 
22
-        bool readPicture(x265_picture*);
23
+        bool readPicture(x265_picture*, int view);
24
         void destroy();
25
 
26
     private:
27
@@ -142,7 +143,7 @@
28
     public:
29
         PassEncoder *m_parentEnc;
30
         int m_id;
31
-        InputFile* m_input;
32
+        InputFile* m_input[MAX_VIEWS];
33
         int m_threadActive;
34
 
35
         Reader(int id, PassEncoder *parentEnc);
36
x265_4.0.tar.gz/source/cmake/FindNEON_DOTPROD.cmake Added
23
 
1
@@ -0,0 +1,21 @@
2
+include(FindPackageHandleStandardArgs)
3
+
4
+# Check if Armv8.4 Neon DotProd is supported by the Arm CPU
5
+if(APPLE)
6
+    execute_process(COMMAND sysctl -a
7
+                    COMMAND grep "hw.optional.arm.FEAT_DotProd: 1"
8
+                    OUTPUT_VARIABLE has_dot_product
9
+                    ERROR_QUIET
10
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
11
+else()
12
+    execute_process(COMMAND cat /proc/cpuinfo
13
+                    COMMAND grep Features
14
+                    COMMAND grep asimddp
15
+                    OUTPUT_VARIABLE has_dot_product
16
+                    ERROR_QUIET
17
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
18
+endif()
19
+
20
+if(has_dot_product)
21
+    set(CPU_HAS_NEON_DOTPROD 1)
22
+endif()
23
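When this test sets CPU_HAS_NEON_DOTPROD, sources such as sad-neon-dotprod.S are compiled with -march=armv8.2-a+dotprod; purely as an illustration of the idea (not code from x265), a DotProd-based SAD of one 16-pixel row can be written with intrinsics as::

    #include <stdint.h>
    #include <arm_neon.h>

    // vabdq_u8 forms per-byte absolute differences; vdotq_u32 accumulates them
    // four at a time into 32-bit lanes, which are then reduced horizontally.
    static inline uint32_t sad16_dotprod(const uint8_t *a, const uint8_t *b)
    {
        uint8x16_t diff = vabdq_u8(vld1q_u8(a), vld1q_u8(b));
        uint32x4_t acc  = vdotq_u32(vdupq_n_u32(0), diff, vdupq_n_u8(1));
        return vaddvq_u32(acc);
    }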
x265_4.0.tar.gz/source/cmake/FindNEON_I8MM.cmake Added
23
 
1
@@ -0,0 +1,21 @@
2
+include(FindPackageHandleStandardArgs)
3
+
4
+# Check if Armv8.6 Neon I8MM is supported by the Arm CPU
5
+if(APPLE)
6
+    execute_process(COMMAND sysctl -a
7
+                    COMMAND grep "hw.optional.arm.FEAT_I8MM: 1"
8
+                    OUTPUT_VARIABLE has_i8mm
9
+                    ERROR_QUIET
10
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
11
+else()
12
+    execute_process(COMMAND cat /proc/cpuinfo
13
+                    COMMAND grep Features
14
+                    COMMAND grep i8mm
15
+                    OUTPUT_VARIABLE has_i8mm
16
+                    ERROR_QUIET
17
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
18
+endif()
19
+
20
+if(has_i8mm)
21
+    set(CPU_HAS_NEON_I8MM 1)
22
+endif()
23
x265_3.6.tar.gz/source/common/CMakeLists.txt -> x265_4.0.tar.gz/source/common/CMakeLists.txt Changed
64
 
1
@@ -103,22 +103,57 @@
2
         add_definitions(-DAUTO_VECTORIZE=1)
3
     endif()
4
 
5
-    set(C_SRCS asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h)
6
+    set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp  mem-neon.h)
7
+    set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp)
8
+    set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp)
9
+    set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp)
10
+    set(C_SRCS_SVE2 sao-prim-sve2.cpp)
11
     enable_language(ASM)
12
 
13
     # add ARM assembly/intrinsic files here
14
-    set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S sad-a-common.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S ipfilter.S ipfilter-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S)
15
-    set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
16
-    set(A_SRCS_SVE2 mc-a-sve2.S sad-a-sve2.S pixel-util-sve2.S ipfilter-sve2.S ssd-a-sve2.S)
17
+    set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S intrapred.S dct.S)
18
+    set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S)
19
+    set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S ssd-a-sve2.S)
20
+    set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
21
     set(VEC_PRIMITIVES)
22
 
23
     set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
24
     set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "ARM Assembly Sources that use SVE instruction set")
25
     set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "ARM Assembly Sources that use SVE2 instruction set")
26
-    foreach(SRC ${C_SRCS})
27
+    set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension")
28
+    foreach(SRC ${C_SRCS_NEON})
29
         set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
30
     endforeach()
31
+
32
+    if(CPU_HAS_NEON_I8MM)
33
+        foreach(SRC ${C_SRCS_NEON_I8MM})
34
+            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
35
+        endforeach()
36
+    endif()
37
+
38
+    if(CPU_HAS_NEON_DOTPROD)
39
+        foreach(SRC ${C_SRCS_NEON_DOTPROD})
40
+            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
41
+        endforeach()
42
+    endif()
43
+
44
+    if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)
45
+        foreach(SRC ${C_SRCS_SVE})
46
+            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
47
+        endforeach()
48
+    endif()
49
+
50
+    if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)
51
+        foreach(SRC ${C_SRCS_SVE2})
52
+            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
53
+        endforeach()
54
+    endif()
55
+
56
     source_group(Assembly FILES ${ASM_PRIMITIVES})
57
+
58
+    if(AARCH64_WARNINGS_AS_ERRORS)
59
+        set_source_files_properties(${ASM_PRIMITIVES} PROPERTIES COMPILE_FLAGS -Werror)
60
+    endif()
61
 endif(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
62
 
63
 if(POWER)
64
x265_3.6.tar.gz/source/common/aarch64/arm64-utils.cpp -> x265_4.0.tar.gz/source/common/aarch64/arm64-utils.cpp Changed
201
 
1
@@ -3,7 +3,6 @@
2
 #include "arm64-utils.h"
3
 #include <arm_neon.h>
4
 
5
-#define COPY_16(d,s) *(uint8x16_t *)(d) = *(uint8x16_t *)(s)
6
 namespace X265_NS
7
 {
8
 
9
@@ -11,53 +10,58 @@
10
 
11
 void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
12
 {
13
-    uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7;
14
-    uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
15
-
16
-    a0 = *(uint8x8_t *)(src + 0 * sstride);
17
-    a1 = *(uint8x8_t *)(src + 1 * sstride);
18
-    a2 = *(uint8x8_t *)(src + 2 * sstride);
19
-    a3 = *(uint8x8_t *)(src + 3 * sstride);
20
-    a4 = *(uint8x8_t *)(src + 4 * sstride);
21
-    a5 = *(uint8x8_t *)(src + 5 * sstride);
22
-    a6 = *(uint8x8_t *)(src + 6 * sstride);
23
-    a7 = *(uint8x8_t *)(src + 7 * sstride);
24
-
25
-    b0 = vtrn1_u32(a0, a4);
26
-    b1 = vtrn1_u32(a1, a5);
27
-    b2 = vtrn1_u32(a2, a6);
28
-    b3 = vtrn1_u32(a3, a7);
29
-    b4 = vtrn2_u32(a0, a4);
30
-    b5 = vtrn2_u32(a1, a5);
31
-    b6 = vtrn2_u32(a2, a6);
32
-    b7 = vtrn2_u32(a3, a7);
33
-
34
-    a0 = vtrn1_u16(b0, b2);
35
-    a1 = vtrn1_u16(b1, b3);
36
-    a2 = vtrn2_u16(b0, b2);
37
-    a3 = vtrn2_u16(b1, b3);
38
-    a4 = vtrn1_u16(b4, b6);
39
-    a5 = vtrn1_u16(b5, b7);
40
-    a6 = vtrn2_u16(b4, b6);
41
-    a7 = vtrn2_u16(b5, b7);
42
-
43
-    b0 = vtrn1_u8(a0, a1);
44
-    b1 = vtrn2_u8(a0, a1);
45
-    b2 = vtrn1_u8(a2, a3);
46
-    b3 = vtrn2_u8(a2, a3);
47
-    b4 = vtrn1_u8(a4, a5);
48
-    b5 = vtrn2_u8(a4, a5);
49
-    b6 = vtrn1_u8(a6, a7);
50
-    b7 = vtrn2_u8(a6, a7);
51
-
52
-    *(uint8x8_t *)(dst + 0 * dstride) = b0;
53
-    *(uint8x8_t *)(dst + 1 * dstride) = b1;
54
-    *(uint8x8_t *)(dst + 2 * dstride) = b2;
55
-    *(uint8x8_t *)(dst + 3 * dstride) = b3;
56
-    *(uint8x8_t *)(dst + 4 * dstride) = b4;
57
-    *(uint8x8_t *)(dst + 5 * dstride) = b5;
58
-    *(uint8x8_t *)(dst + 6 * dstride) = b6;
59
-    *(uint8x8_t *)(dst + 7 * dstride) = b7;
60
+    uint8x8_t a0 = vld1_u8(src + 0 * sstride);
61
+    uint8x8_t a1 = vld1_u8(src + 1 * sstride);
62
+    uint8x8_t a2 = vld1_u8(src + 2 * sstride);
63
+    uint8x8_t a3 = vld1_u8(src + 3 * sstride);
64
+    uint8x8_t a4 = vld1_u8(src + 4 * sstride);
65
+    uint8x8_t a5 = vld1_u8(src + 5 * sstride);
66
+    uint8x8_t a6 = vld1_u8(src + 6 * sstride);
67
+    uint8x8_t a7 = vld1_u8(src + 7 * sstride);
68
+
69
+    uint32x2_t b0 = vtrn1_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
70
+    uint32x2_t b1 = vtrn1_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
71
+    uint32x2_t b2 = vtrn1_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
72
+    uint32x2_t b3 = vtrn1_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));
73
+    uint32x2_t b4 = vtrn2_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
74
+    uint32x2_t b5 = vtrn2_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
75
+    uint32x2_t b6 = vtrn2_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
76
+    uint32x2_t b7 = vtrn2_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));
77
+
78
+    uint16x4_t c0 = vtrn1_u16(vreinterpret_u16_u32(b0),
79
+                              vreinterpret_u16_u32(b2));
80
+    uint16x4_t c1 = vtrn1_u16(vreinterpret_u16_u32(b1),
81
+                              vreinterpret_u16_u32(b3));
82
+    uint16x4_t c2 = vtrn2_u16(vreinterpret_u16_u32(b0),
83
+                              vreinterpret_u16_u32(b2));
84
+    uint16x4_t c3 = vtrn2_u16(vreinterpret_u16_u32(b1),
85
+                              vreinterpret_u16_u32(b3));
86
+    uint16x4_t c4 = vtrn1_u16(vreinterpret_u16_u32(b4),
87
+                              vreinterpret_u16_u32(b6));
88
+    uint16x4_t c5 = vtrn1_u16(vreinterpret_u16_u32(b5),
89
+                              vreinterpret_u16_u32(b7));
90
+    uint16x4_t c6 = vtrn2_u16(vreinterpret_u16_u32(b4),
91
+                              vreinterpret_u16_u32(b6));
92
+    uint16x4_t c7 = vtrn2_u16(vreinterpret_u16_u32(b5),
93
+                              vreinterpret_u16_u32(b7));
94
+
95
+    uint8x8_t d0 = vtrn1_u8(vreinterpret_u8_u16(c0), vreinterpret_u8_u16(c1));
96
+    uint8x8_t d1 = vtrn2_u8(vreinterpret_u8_u16(c0), vreinterpret_u8_u16(c1));
97
+    uint8x8_t d2 = vtrn1_u8(vreinterpret_u8_u16(c2), vreinterpret_u8_u16(c3));
98
+    uint8x8_t d3 = vtrn2_u8(vreinterpret_u8_u16(c2), vreinterpret_u8_u16(c3));
99
+    uint8x8_t d4 = vtrn1_u8(vreinterpret_u8_u16(c4), vreinterpret_u8_u16(c5));
100
+    uint8x8_t d5 = vtrn2_u8(vreinterpret_u8_u16(c4), vreinterpret_u8_u16(c5));
101
+    uint8x8_t d6 = vtrn1_u8(vreinterpret_u8_u16(c6), vreinterpret_u8_u16(c7));
102
+    uint8x8_t d7 = vtrn2_u8(vreinterpret_u8_u16(c6), vreinterpret_u8_u16(c7));
103
+
104
+    vst1_u8(dst + 0 * dstride, d0);
105
+    vst1_u8(dst + 1 * dstride, d1);
106
+    vst1_u8(dst + 2 * dstride, d2);
107
+    vst1_u8(dst + 3 * dstride, d3);
108
+    vst1_u8(dst + 4 * dstride, d4);
109
+    vst1_u8(dst + 5 * dstride, d5);
110
+    vst1_u8(dst + 6 * dstride, d6);
111
+    vst1_u8(dst + 7 * dstride, d7);
112
 }
113
 
114
 
115
@@ -67,97 +71,171 @@
116
 
117
 void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
118
 {
119
-    uint16x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aA, aB, aC, aD, aE, aF;
120
-    uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, bA, bB, bC, bD, bE, bF;
121
-    uint16x8_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF;
122
-    uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, dA, dB, dC, dD, dE, dF;
123
-
124
-    a0 = *(uint16x8_t *)(src + 0 * sstride);
125
-    a1 = *(uint16x8_t *)(src + 1 * sstride);
126
-    a2 = *(uint16x8_t *)(src + 2 * sstride);
127
-    a3 = *(uint16x8_t *)(src + 3 * sstride);
128
-    a4 = *(uint16x8_t *)(src + 4 * sstride);
129
-    a5 = *(uint16x8_t *)(src + 5 * sstride);
130
-    a6 = *(uint16x8_t *)(src + 6 * sstride);
131
-    a7 = *(uint16x8_t *)(src + 7 * sstride);
132
-    a8 = *(uint16x8_t *)(src + 8 * sstride);
133
-    a9 = *(uint16x8_t *)(src + 9 * sstride);
134
-    aA = *(uint16x8_t *)(src + 10 * sstride);
135
-    aB = *(uint16x8_t *)(src + 11 * sstride);
136
-    aC = *(uint16x8_t *)(src + 12 * sstride);
137
-    aD = *(uint16x8_t *)(src + 13 * sstride);
138
-    aE = *(uint16x8_t *)(src + 14 * sstride);
139
-    aF = *(uint16x8_t *)(src + 15 * sstride);
140
-
141
-    b0 = vtrn1q_u64(a0, a8);
142
-    b1 = vtrn1q_u64(a1, a9);
143
-    b2 = vtrn1q_u64(a2, aA);
144
-    b3 = vtrn1q_u64(a3, aB);
145
-    b4 = vtrn1q_u64(a4, aC);
146
-    b5 = vtrn1q_u64(a5, aD);
147
-    b6 = vtrn1q_u64(a6, aE);
148
-    b7 = vtrn1q_u64(a7, aF);
149
-    b8 = vtrn2q_u64(a0, a8);
150
-    b9 = vtrn2q_u64(a1, a9);
151
-    bA = vtrn2q_u64(a2, aA);
152
-    bB = vtrn2q_u64(a3, aB);
153
-    bC = vtrn2q_u64(a4, aC);
154
-    bD = vtrn2q_u64(a5, aD);
155
-    bE = vtrn2q_u64(a6, aE);
156
-    bF = vtrn2q_u64(a7, aF);
157
-
158
-    c0 = vtrn1q_u32(b0, b4);
159
-    c1 = vtrn1q_u32(b1, b5);
160
-    c2 = vtrn1q_u32(b2, b6);
161
-    c3 = vtrn1q_u32(b3, b7);
162
-    c4 = vtrn2q_u32(b0, b4);
163
-    c5 = vtrn2q_u32(b1, b5);
164
-    c6 = vtrn2q_u32(b2, b6);
165
-    c7 = vtrn2q_u32(b3, b7);
166
-    c8 = vtrn1q_u32(b8, bC);
167
-    c9 = vtrn1q_u32(b9, bD);
168
-    cA = vtrn1q_u32(bA, bE);
169
-    cB = vtrn1q_u32(bB, bF);
170
-    cC = vtrn2q_u32(b8, bC);
171
-    cD = vtrn2q_u32(b9, bD);
172
-    cE = vtrn2q_u32(bA, bE);
173
-    cF = vtrn2q_u32(bB, bF);
174
-
175
-    d0 = vtrn1q_u16(c0, c2);
176
-    d1 = vtrn1q_u16(c1, c3);
177
-    d2 = vtrn2q_u16(c0, c2);
178
-    d3 = vtrn2q_u16(c1, c3);
179
-    d4 = vtrn1q_u16(c4, c6);
180
-    d5 = vtrn1q_u16(c5, c7);
181
-    d6 = vtrn2q_u16(c4, c6);
182
-    d7 = vtrn2q_u16(c5, c7);
183
-    d8 = vtrn1q_u16(c8, cA);
184
-    d9 = vtrn1q_u16(c9, cB);
185
-    dA = vtrn2q_u16(c8, cA);
186
-    dB = vtrn2q_u16(c9, cB);
187
-    dC = vtrn1q_u16(cC, cE);
188
-    dD = vtrn1q_u16(cD, cF);
189
-    dE = vtrn2q_u16(cC, cE);
190
-    dF = vtrn2q_u16(cD, cF);
191
-
192
-    *(uint16x8_t *)(dst + 0 * dstride)  = vtrn1q_u8(d0, d1);
193
-    *(uint16x8_t *)(dst + 1 * dstride)  = vtrn2q_u8(d0, d1);
194
-    *(uint16x8_t *)(dst + 2 * dstride)  = vtrn1q_u8(d2, d3);
195
-    *(uint16x8_t *)(dst + 3 * dstride)  = vtrn2q_u8(d2, d3);
196
-    *(uint16x8_t *)(dst + 4 * dstride)  = vtrn1q_u8(d4, d5);
197
-    *(uint16x8_t *)(dst + 5 * dstride)  = vtrn2q_u8(d4, d5);
198
-    *(uint16x8_t *)(dst + 6 * dstride)  = vtrn1q_u8(d6, d7);
199
-    *(uint16x8_t *)(dst + 7 * dstride)  = vtrn2q_u8(d6, d7);
200
-    *(uint16x8_t *)(dst + 8 * dstride)  = vtrn1q_u8(d8, d9);
201
x265_3.6.tar.gz/source/common/aarch64/arm64-utils.h -> x265_4.0.tar.gz/source/common/aarch64/arm64-utils.h Changed
9
 
1
@@ -1,6 +1,7 @@
2
 #ifndef __ARM64_UTILS_H__
3
 #define __ARM64_UTILS_H__
4
 
5
+#include <stdint.h>
6
 
7
 namespace X265_NS
8
 {
9
x265_3.6.tar.gz/source/common/aarch64/asm-primitives.cpp -> x265_4.0.tar.gz/source/common/aarch64/asm-primitives.cpp Changed
201
 
1
@@ -39,15 +39,9 @@
2
     p.cuBLOCK_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
3
     p.cuBLOCK_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
4
     p.cuBLOCK_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu)
5
-#define LUMA_TU_TYPED_NEON(prim, fncdef, fname) \
6
-    p.cuBLOCK_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
7
-    p.cuBLOCK_8x8.prim   = fncdef PFX(fname ## _8x8_ ## neon); \
8
-    p.cuBLOCK_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
9
-    p.cuBLOCK_64x64.prim = fncdef PFX(fname ## _64x64_ ## neon)
10
 #define LUMA_TU_TYPED_CAN_USE_SVE(prim, fncdef, fname) \
11
     p.cuBLOCK_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve)
12
 #define ALL_LUMA_TU(prim, fname, cpu)      ALL_LUMA_TU_TYPED(prim, , fname, cpu)
13
-#define LUMA_TU_NEON(prim, fname)      LUMA_TU_TYPED_NEON(prim, , fname)
14
 #define LUMA_TU_CAN_USE_SVE(prim, fname)      LUMA_TU_TYPED_CAN_USE_SVE(prim, , fname)
15
 
16
 #define ALL_LUMA_PU_TYPED(prim, fncdef, fname, cpu) \
17
@@ -76,50 +70,6 @@
18
     p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
19
     p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
20
     p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
21
-#define LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, fncdef, fname, cpu) \
22
-    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
23
-    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
24
-    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## cpu)
25
-#define LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, fncdef, fname, cpu) \
26
-    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
27
-    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
28
-    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
29
-    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
30
-    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
31
-    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
32
-    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
33
-    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
34
-    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
35
-    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
36
-    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
37
-    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
38
-    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
39
-    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
40
-    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
41
-    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
42
-    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
43
-    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
44
-    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
45
-    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
46
-    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
47
-    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
48
-#define LUMA_PU_TYPED_NEON_1(prim, fncdef, fname) \
49
-    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
50
-    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
51
-    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon); \
52
-    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## neon); \
53
-    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## neon); \
54
-    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
55
-    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## neon); \
56
-    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## neon); \
57
-    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## neon); \
58
-    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
59
-    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
60
-    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## neon); \
61
-    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## neon); \
62
-    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## neon); \
63
-    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## neon); \
64
-    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon)
65
 #define LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname) \
66
     p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve); \
67
     p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## sve); \
68
@@ -130,20 +80,6 @@
69
     p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## sve); \
70
     p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## sve); \
71
     p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## sve)
72
-#define LUMA_PU_TYPED_NEON_2(prim, fncdef, fname) \
73
-    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
74
-    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## neon); \
75
-    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
76
-    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## neon); \
77
-    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## neon); \
78
-    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## neon); \
79
-    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
80
-    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
81
-    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
82
-    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## neon); \
83
-    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon); \
84
-    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## neon); \
85
-    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon)
86
 #define LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, fncdef, fname, cpu) \
87
     p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
88
     p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
89
@@ -157,10 +93,6 @@
90
     p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
91
     p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
92
     p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu)
93
-#define LUMA_PU_TYPED_NEON_3(prim, fncdef, fname) \
94
-    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
95
-    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
96
-    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon)
97
 #define LUMA_PU_TYPED_CAN_USE_SVE2(prim, fncdef, fname) \
98
     p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## sve2); \
99
     p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## sve2); \
100
@@ -184,22 +116,6 @@
101
     p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## sve2); \
102
     p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## sve2); \
103
     p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## sve2)
104
-#define LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
105
-    p.puLUMA_4x4.prim   = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
106
-    p.puLUMA_8x8.prim   = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
107
-    p.puLUMA_16x16.prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
108
-    p.puLUMA_8x4.prim   = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
109
-    p.puLUMA_4x8.prim   = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
110
-    p.puLUMA_16x8.prim  = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
111
-    p.puLUMA_8x16.prim  = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
112
-    p.puLUMA_16x32.prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
113
-    p.puLUMA_16x12.prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
114
-    p.puLUMA_12x16.prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
115
-    p.puLUMA_16x4.prim  = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
116
-    p.puLUMA_4x16.prim  = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
117
-    p.puLUMA_24x32.prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
118
-    p.puLUMA_8x32.prim  = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
119
-    p.puLUMA_16x64.prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon)
120
 #define LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
121
     p.puLUMA_32x32.prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
122
     p.puLUMA_32x16.prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \
123
@@ -211,17 +127,29 @@
124
     p.puLUMA_64x48.prim = fncdef PFX(filterPixelToShort ## _64x48_ ## sve); \
125
     p.puLUMA_64x16.prim = fncdef PFX(filterPixelToShort ## _64x16_ ## sve); \
126
     p.puLUMA_48x64.prim = fncdef PFX(filterPixelToShort ## _48x64_ ## sve)
127
+#define LUMA_PU_TYPED_MULTIPLE_16(prim, fncdef, fname, cpu)      \
128
+    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
129
+    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
130
+    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
131
+    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## cpu);  \
132
+    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
133
+    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
134
+    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
135
+    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
136
+    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
137
+    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## cpu);  \
138
+    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
139
+    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu);  \
140
+    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
141
+    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
142
+    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
143
+    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
144
 #define ALL_LUMA_PU(prim, fname, cpu) ALL_LUMA_PU_TYPED(prim, , fname, cpu)
145
-#define LUMA_PU_MULTIPLE_ARCHS_1(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, , fname, cpu)
146
-#define LUMA_PU_MULTIPLE_ARCHS_2(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, , fname, cpu)
147
-#define LUMA_PU_NEON_1(prim, fname) LUMA_PU_TYPED_NEON_1(prim, , fname)
148
 #define LUMA_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fname) LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, , fname)
149
-#define LUMA_PU_NEON_2(prim, fname) LUMA_PU_TYPED_NEON_2(prim, , fname)
150
 #define LUMA_PU_MULTIPLE_ARCHS_3(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, , fname, cpu)
151
-#define LUMA_PU_NEON_3(prim, fname) LUMA_PU_TYPED_NEON_3(prim, , fname)
152
 #define LUMA_PU_CAN_USE_SVE2(prim, fname) LUMA_PU_TYPED_CAN_USE_SVE2(prim, , fname)
153
-#define LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(prim) LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, )
154
 #define LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
155
+#define LUMA_PU_MULTIPLE_16(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_16(prim, , fname, cpu)
156
 
157
 
158
 #define ALL_LUMA_PU_T(prim, fname) \
159
@@ -276,37 +204,9 @@
160
     p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
161
     p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
162
     p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim  = fncdef PFX(fname ## _8x32_ ## cpu)
163
-#define CHROMA_420_PU_TYPED_NEON_1(prim, fncdef, fname)               \
164
-    p.chromaX265_CSP_I420.puCHROMA_420_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
165
-    p.chromaX265_CSP_I420.puCHROMA_420_4x2.prim   = fncdef PFX(fname ## _4x2_ ## neon); \
166
-    p.chromaX265_CSP_I420.puCHROMA_420_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
167
-    p.chromaX265_CSP_I420.puCHROMA_420_6x8.prim   = fncdef PFX(fname ## _6x8_ ## neon); \
168
-    p.chromaX265_CSP_I420.puCHROMA_420_12x16.prim = fncdef PFX(fname ## _12x16_ ## neon); \
169
-    p.chromaX265_CSP_I420.puCHROMA_420_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon); \
170
-    p.chromaX265_CSP_I420.puCHROMA_420_32x24.prim = fncdef PFX(fname ## _32x24_ ## neon); \
171
-    p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## neon); \
172
-    p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim  = fncdef PFX(fname ## _32x8_ ## neon); \
173
-    p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim  = fncdef PFX(fname ## _8x32_ ## neon); \
174
-    p.chromaX265_CSP_I420.puCHROMA_420_8x8.prim   = fncdef PFX(fname ## _8x8_ ## neon); \
175
-    p.chromaX265_CSP_I420.puCHROMA_420_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
176
-    p.chromaX265_CSP_I420.puCHROMA_420_2x4.prim   = fncdef PFX(fname ## _2x4_ ## neon); \
177
-    p.chromaX265_CSP_I420.puCHROMA_420_8x4.prim   = fncdef PFX(fname ## _8x4_ ## neon); \
178
-    p.chromaX265_CSP_I420.puCHROMA_420_16x8.prim  = fncdef PFX(fname ## _16x8_ ## neon); \
179
-    p.chromaX265_CSP_I420.puCHROMA_420_8x16.prim  = fncdef PFX(fname ## _8x16_ ## neon); \
180
-    p.chromaX265_CSP_I420.puCHROMA_420_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
181
-    p.chromaX265_CSP_I420.puCHROMA_420_8x6.prim   = fncdef PFX(fname ## _8x6_ ## neon); \
182
-    p.chromaX265_CSP_I420.puCHROMA_420_8x2.prim   = fncdef PFX(fname ## _8x2_ ## neon); \
183
-    p.chromaX265_CSP_I420.puCHROMA_420_2x8.prim   = fncdef PFX(fname ## _2x8_ ## neon); \
184
-    p.chromaX265_CSP_I420.puCHROMA_420_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
185
-    p.chromaX265_CSP_I420.puCHROMA_420_16x4.prim  = fncdef PFX(fname ## _16x4_ ## neon)
186
 #define CHROMA_420_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname)               \
187
     p.chromaX265_CSP_I420.puCHROMA_420_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve); \
188
     p.chromaX265_CSP_I420.puCHROMA_420_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve)
189
-#define CHROMA_420_PU_TYPED_NEON_2(prim, fncdef, fname)               \
190
-    p.chromaX265_CSP_I420.puCHROMA_420_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
191
-    p.chromaX265_CSP_I420.puCHROMA_420_4x2.prim   = fncdef PFX(fname ## _4x2_ ## neon); \
192
-    p.chromaX265_CSP_I420.puCHROMA_420_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
193
-    p.chromaX265_CSP_I420.puCHROMA_420_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon)
194
 #define CHROMA_420_PU_TYPED_MULTIPLE_ARCHS(prim, fncdef, fname, cpu)               \
195
     p.chromaX265_CSP_I420.puCHROMA_420_8x8.prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
196
     p.chromaX265_CSP_I420.puCHROMA_420_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
197
@@ -328,23 +228,6 @@
198
     p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
199
     p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
200
     p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim  = fncdef PFX(fname ## _8x32_ ## cpu)
201
x265_3.6.tar.gz/source/common/aarch64/asm.S -> x265_4.0.tar.gz/source/common/aarch64/asm.S Changed
40
 
1
@@ -72,6 +72,16 @@
2
 
3
 #define PFX_C(name)        JOIN(JOIN(JOIN(EXTERN_ASM, X265_NS), _), name)
4
 
5
+// Alignment of stack arguments of size less than 8 bytes.
6
+#ifdef __APPLE__
7
+#define STACK_ARG_ALIGNMENT 4
8
+#else
9
+#define STACK_ARG_ALIGNMENT 8
10
+#endif
11
+
12
+// Get offset from SP of stack argument at index `idx`.
13
+#define STACK_ARG_OFFSET(idx) (idx * STACK_ARG_ALIGNMENT)
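+// For illustration, reading the third stack argument (idx 2) into an
+// arbitrary register would be:
+//     ldr             w9, [sp, #STACK_ARG_OFFSET(2)]
+// which resolves to [sp, #8] on Apple targets and [sp, #16] elsewhere.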
14
+
15
 #ifdef __APPLE__
16
 .macro endfunc
17
 ELF .size \name, . - \name
18
@@ -184,4 +194,19 @@
19
     vtrn            \t3, \t4, \s3, \s4
20
 .endm
21
 
22
-#endif
23
\ No newline at end of file
24
+
25
+.macro push_vec_regs
26
+    stp             d8, d9, [sp, #-16]!
27
+    stp             d10, d11, [sp, #-16]!
28
+    stp             d12, d13, [sp, #-16]!
29
+    stp             d14, d15, [sp, #-16]!
30
+.endm
31
+
32
+.macro pop_vec_regs
33
+    ldp             d14, d15, [sp], #16
34
+    ldp             d12, d13, [sp], #16
35
+    ldp             d10, d11, [sp], #16
36
+    ldp             d8, d9, [sp], #16
37
+.endm
38
+
39
+#endif
40
x265_3.6.tar.gz/source/common/aarch64/blockcopy8-sve.S -> x265_4.0.tar.gz/source/common/aarch64/blockcopy8-sve.S Changed
201
 
1
@@ -112,7 +112,7 @@
2
     lsl             x3, x3, #1
3
     movrel          x11, xtn_xtn2_table
4
     ld1             {v31.16b}, x11
5
-.loop_csp32_sve:
6
+.Loop_csp32_sve:
7
     sub             w12, w12, #1
8
 .rept 4
9
     ld1             {v0.8h-v3.8h}, x2, x3
10
@@ -124,7 +124,7 @@
11
     st1             {v0.16b-v1.16b}, x0, x1
12
     st1             {v2.16b-v3.16b}, x0, x1
13
 .endr
14
-    cbnz            w12, .loop_csp32_sve
15
+    cbnz            w12, .Loop_csp32_sve
16
     ret
17
 .vl_gt_16_blockcopy_sp_32_32:
18
     cmp             x9, #48
19
@@ -199,7 +199,7 @@
20
     bgt             .vl_gt_16_blockcopy_ps_32_32
21
     lsl             x1, x1, #1
22
     mov             w12, #4
23
-.loop_cps32_sve:
24
+.Loop_cps32_sve:
25
     sub             w12, w12, #1
26
 .rept 4
27
     ld1             {v16.16b-v17.16b}, x2, x3
28
@@ -215,7 +215,7 @@
29
     st1             {v0.8h-v3.8h}, x0, x1
30
     st1             {v4.8h-v7.8h}, x0, x1
31
 .endr
32
-    cbnz            w12, .loop_cps32_sve
33
+    cbnz            w12, .Loop_cps32_sve
34
     ret
35
 .vl_gt_16_blockcopy_ps_32_32:
36
     cmp             x9, #48
37
@@ -248,7 +248,7 @@
38
     lsl             x1, x1, #1
39
     sub             x1, x1, #64
40
     mov             w12, #16
41
-.loop_cps64_sve:
42
+.Loop_cps64_sve:
43
     sub             w12, w12, #1
44
 .rept 4
45
     ld1             {v16.16b-v19.16b}, x2, x3
46
@@ -263,7 +263,7 @@
47
     st1             {v0.8h-v3.8h}, x0, #64
48
     st1             {v4.8h-v7.8h}, x0, x1
49
 .endr
50
-    cbnz            w12, .loop_cps64_sve
51
+    cbnz            w12, .Loop_cps64_sve
52
     ret
53
 .vl_gt_16_blockcopy_ps_64_64:
54
     cmp             x9, #48
55
@@ -338,13 +338,13 @@
56
     lsl             x1, x1, #1
57
     lsl             x3, x3, #1
58
     mov             w12, #4
59
-.loop_css32_sve:
60
+.Loop_css32_sve:
61
     sub             w12, w12, #1
62
 .rept 8
63
     ld1             {v0.8h-v3.8h}, x2, x3
64
     st1             {v0.8h-v3.8h}, x0, x1
65
 .endr
66
-    cbnz            w12, .loop_css32_sve
67
+    cbnz            w12, .Loop_css32_sve
68
     ret
69
 .vl_gt_16_blockcopy_ss_32_32:
70
     cmp             x9, #48
71
@@ -379,7 +379,7 @@
72
     lsl             x3, x3, #1
73
     sub             x3, x3, #64
74
     mov             w12, #8
75
-.loop_css64_sve:
76
+.Loop_css64_sve:
77
     sub             w12, w12, #1
78
 .rept 8
79
     ld1             {v0.8h-v3.8h}, x2, #64
80
@@ -387,7 +387,7 @@
81
     st1             {v0.8h-v3.8h}, x0, #64
82
     st1             {v4.8h-v7.8h}, x0, x1
83
 .endr
84
-    cbnz            w12, .loop_css64_sve
85
+    cbnz            w12, .Loop_css64_sve
86
     ret
87
 .vl_gt_16_blockcopy_ss_64_64:
88
     cmp             x9, #48
89
@@ -474,13 +474,13 @@
90
     lsl             x1, x1, #1
91
     lsl             x3, x3, #1
92
     mov             w12, #8
93
-.loop_css32x64_sve:
94
+.Loop_css32x64_sve:
95
     sub             w12, w12, #1
96
 .rept 8
97
     ld1             {v0.8h-v3.8h}, x2, x3
98
     st1             {v0.8h-v3.8h}, x0, x1
99
 .endr
100
-    cbnz            w12, .loop_css32x64_sve
101
+    cbnz            w12, .Loop_css32x64_sve
102
     ret
103
 .vl_gt_16_blockcopy_ss_32_64:
104
     cmp             x9, #48
105
@@ -570,7 +570,7 @@
106
     bgt             .vl_gt_16_blockcopy_ps_32_64
107
     lsl             x1, x1, #1
108
     mov             w12, #8
109
-.loop_cps32x64_sve:
110
+.Loop_cps32x64_sve:
111
     sub             w12, w12, #1
112
 .rept 4
113
     ld1             {v16.16b-v17.16b}, x2, x3
114
@@ -586,7 +586,7 @@
115
     st1             {v0.8h-v3.8h}, x0, x1
116
     st1             {v4.8h-v7.8h}, x0, x1
117
 .endr
118
-    cbnz            w12, .loop_cps32x64_sve
119
+    cbnz            w12, .Loop_cps32x64_sve
120
     ret
121
 .vl_gt_16_blockcopy_ps_32_64:
122
     cmp             x9, #48
123
@@ -730,13 +730,13 @@
124
     rdvl            x9, #1
125
     cmp             x9, #16
126
     bgt             .vl_gt_16_blockcopy_pp_32xN_\h
127
-.loop_sve_32x\h\():
128
+.Loop_sve_32x\h\():
129
     sub             w12, w12, #1
130
 .rept 8
131
     ld1             {v0.16b-v1.16b}, x2, x3
132
     st1             {v0.16b-v1.16b}, x0, x1
133
 .endr
134
-    cbnz            w12, .loop_sve_32x\h
135
+    cbnz            w12, .Loop_sve_32x\h
136
     ret
137
 .vl_gt_16_blockcopy_pp_32xN_\h:
138
     ptrue           p0.b, vl32
139
@@ -765,13 +765,13 @@
140
     rdvl            x9, #1
141
     cmp             x9, #16
142
     bgt             .vl_gt_16_blockcopy_pp_64xN_\h
143
-.loop_sve_64x\h\():
144
+.Loop_sve_64x\h\():
145
     sub             w12, w12, #1
146
 .rept 4
147
     ld1             {v0.16b-v3.16b}, x2, x3
148
     st1             {v0.16b-v3.16b}, x0, x1
149
 .endr
150
-    cbnz            w12, .loop_sve_64x\h
151
+    cbnz            w12, .Loop_sve_64x\h
152
     ret
153
 .vl_gt_16_blockcopy_pp_64xN_\h:
154
     cmp             x9, #48
155
@@ -856,7 +856,7 @@
156
     bgt             .vl_gt_16_cpy2Dto1D_shl_16x16
157
     cpy2Dto1D_shl_start_sve
158
     mov             w12, #4
159
-.loop_cpy2Dto1D_shl_16_sve:
160
+.Loop_cpy2Dto1D_shl_16_sve:
161
     sub             w12, w12, #1
162
 .rept 4
163
     ld1             {v2.16b-v3.16b}, x1, x2
164
@@ -864,7 +864,7 @@
165
     sshl            v3.8h, v3.8h, v0.8h
166
     st1             {v2.16b-v3.16b}, x0, #32
167
 .endr
168
-    cbnz            w12, .loop_cpy2Dto1D_shl_16_sve
169
+    cbnz            w12, .Loop_cpy2Dto1D_shl_16_sve
170
     ret
171
 .vl_gt_16_cpy2Dto1D_shl_16x16:
172
     ptrue           p0.h, vl16
173
@@ -885,7 +885,7 @@
174
     bgt             .vl_gt_16_cpy2Dto1D_shl_32x32
175
     cpy2Dto1D_shl_start_sve
176
     mov             w12, #16
177
-.loop_cpy2Dto1D_shl_32_sve:
178
+.Loop_cpy2Dto1D_shl_32_sve:
179
     sub             w12, w12, #1
180
 .rept 2
181
     ld1             {v2.16b-v5.16b}, x1, x2
182
@@ -895,7 +895,7 @@
183
     sshl            v5.8h, v5.8h, v0.8h
184
     st1             {v2.16b-v5.16b}, x0, #64
185
 .endr
186
-    cbnz            w12, .loop_cpy2Dto1D_shl_32_sve
187
+    cbnz            w12, .Loop_cpy2Dto1D_shl_32_sve
188
     ret
189
 .vl_gt_16_cpy2Dto1D_shl_32x32:
190
     cmp             x9, #48
191
@@ -931,7 +931,7 @@
192
     cpy2Dto1D_shl_start_sve
193
     mov             w12, #32
194
     sub             x2, x2, #64
195
-.loop_cpy2Dto1D_shl_64_sve:
196
+.Loop_cpy2Dto1D_shl_64_sve:
197
     sub             w12, w12, #1
198
 .rept 2
199
     ld1             {v2.16b-v5.16b}, x1, #64
200
@@ -947,7 +947,7 @@
201
x265_3.6.tar.gz/source/common/aarch64/blockcopy8.S -> x265_4.0.tar.gz/source/common/aarch64/blockcopy8.S Changed
201
 
1
@@ -86,7 +86,7 @@
2
     lsl             x3, x3, #1
3
     movrel          x11, xtn_xtn2_table
4
     ld1             {v31.16b}, x11
5
-.loop_csp32:
6
+.Loop_csp32:
7
     sub             w12, w12, #1
8
 .rept 4
9
     ld1             {v0.8h-v3.8h}, x2, x3
10
@@ -98,7 +98,7 @@
11
     st1             {v0.16b-v1.16b}, x0, x1
12
     st1             {v2.16b-v3.16b}, x0, x1
13
 .endr
14
-    cbnz            w12, .loop_csp32
15
+    cbnz            w12, .Loop_csp32
16
     ret
17
 endfunc
18
 
19
@@ -108,7 +108,7 @@
20
     sub             x3, x3, #64
21
     movrel          x11, xtn_xtn2_table
22
     ld1             {v31.16b}, x11
23
-.loop_csp64:
24
+.Loop_csp64:
25
     sub             w12, w12, #1
26
 .rept 4
27
     ld1             {v0.8h-v3.8h}, x2, #64
28
@@ -119,7 +119,7 @@
29
     tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
30
     st1             {v0.16b-v3.16b}, x0, x1
31
 .endr
32
-    cbnz            w12, .loop_csp64
33
+    cbnz            w12, .Loop_csp64
34
     ret
35
 endfunc
36
 
37
@@ -168,7 +168,7 @@
38
 function PFX(blockcopy_ps_32x32_neon)
39
     lsl             x1, x1, #1
40
     mov             w12, #4
41
-.loop_cps32:
42
+.Loop_cps32:
43
     sub             w12, w12, #1
44
 .rept 4
45
     ld1             {v16.16b-v17.16b}, x2, x3
46
@@ -184,7 +184,7 @@
47
     st1             {v0.8h-v3.8h}, x0, x1
48
     st1             {v4.8h-v7.8h}, x0, x1
49
 .endr
50
-    cbnz            w12, .loop_cps32
51
+    cbnz            w12, .Loop_cps32
52
     ret
53
 endfunc
54
 
55
@@ -192,7 +192,7 @@
56
     lsl             x1, x1, #1
57
     sub             x1, x1, #64
58
     mov             w12, #16
59
-.loop_cps64:
60
+.Loop_cps64:
61
     sub             w12, w12, #1
62
 .rept 4
63
     ld1             {v16.16b-v19.16b}, x2, x3
64
@@ -207,7 +207,7 @@
65
     st1             {v0.8h-v3.8h}, x0, #64
66
     st1             {v4.8h-v7.8h}, x0, x1
67
 .endr
68
-    cbnz            w12, .loop_cps64
69
+    cbnz            w12, .Loop_cps64
70
     ret
71
 endfunc
72
 
73
@@ -252,13 +252,13 @@
74
     lsl             x1, x1, #1
75
     lsl             x3, x3, #1
76
     mov             w12, #4
77
-.loop_css32:
78
+.Loop_css32:
79
     sub             w12, w12, #1
80
 .rept 8
81
     ld1             {v0.8h-v3.8h}, x2, x3
82
     st1             {v0.8h-v3.8h}, x0, x1
83
 .endr
84
-    cbnz            w12, .loop_css32
85
+    cbnz            w12, .Loop_css32
86
     ret
87
 endfunc
88
 
89
@@ -268,7 +268,7 @@
90
     lsl             x3, x3, #1
91
     sub             x3, x3, #64
92
     mov             w12, #8
93
-.loop_css64:
94
+.Loop_css64:
95
     sub             w12, w12, #1
96
 .rept 8
97
     ld1             {v0.8h-v3.8h}, x2, #64
98
@@ -276,7 +276,7 @@
99
     st1             {v0.8h-v3.8h}, x0, #64
100
     st1             {v4.8h-v7.8h}, x0, x1
101
 .endr
102
-    cbnz            w12, .loop_css64
103
+    cbnz            w12, .Loop_css64
104
     ret
105
 endfunc
106
 
107
@@ -321,13 +321,13 @@
108
     lsl             x1, x1, #1
109
     lsl             x3, x3, #1
110
     mov             w12, #8
111
-.loop_css32x64:
112
+.Loop_css32x64:
113
     sub             w12, w12, #1
114
 .rept 8
115
     ld1             {v0.8h-v3.8h}, x2, x3
116
     st1             {v0.8h-v3.8h}, x0, x1
117
 .endr
118
-    cbnz            w12, .loop_css32x64
119
+    cbnz            w12, .Loop_css32x64
120
     ret
121
 endfunc
122
 
123
@@ -376,7 +376,7 @@
124
 function PFX(blockcopy_ps_32x64_neon)
125
     lsl             x1, x1, #1
126
     mov             w12, #8
127
-.loop_cps32x64:
128
+.Loop_cps32x64:
129
     sub             w12, w12, #1
130
 .rept 4
131
     ld1             {v16.16b-v17.16b}, x2, x3
132
@@ -392,7 +392,7 @@
133
     st1             {v0.8h-v3.8h}, x0, x1
134
     st1             {v4.8h-v7.8h}, x0, x1
135
 .endr
136
-    cbnz            w12, .loop_cps32x64
137
+    cbnz            w12, .Loop_cps32x64
138
     ret
139
 endfunc
140
 
141
@@ -443,7 +443,7 @@
142
     lsl             x3, x3, #1
143
     movrel          x11, xtn_xtn2_table
144
     ld1             {v31.16b}, x11
145
-.loop_csp32x64:
146
+.Loop_csp32x64:
147
     sub             w12, w12, #1
148
 .rept 4
149
     ld1             {v0.8h-v3.8h}, x2, x3
150
@@ -455,7 +455,7 @@
151
     st1             {v0.16b-v1.16b}, x0, x1
152
     st1             {v2.16b-v3.16b}, x0, x1
153
 .endr
154
-    cbnz            w12, .loop_csp32x64
155
+    cbnz            w12, .Loop_csp32x64
156
     ret
157
 endfunc
158
 
159
@@ -595,13 +595,13 @@
160
 
161
 function PFX(blockcopy_pp_8x64_neon)
162
     mov             w12, #4
163
-.loop_pp_8x64:
164
+.Loop_pp_8x64:
165
     sub             w12, w12, #1
166
 .rept 16
167
     ld1             {v0.4h}, x2, x3
168
     st1             {v0.4h}, x0, x1
169
 .endr
170
-    cbnz            w12, .loop_pp_8x64
171
+    cbnz            w12, .Loop_pp_8x64
172
     ret
173
 endfunc
174
 
175
@@ -623,13 +623,13 @@
176
 .macro blockcopy_pp_16xN1_neon h
177
 function PFX(blockcopy_pp_16x\h\()_neon)
178
     mov             w12, #\h / 8
179
-.loop_16x\h\():
180
+.Loop_16x\h\():
181
 .rept 8
182
     ld1             {v0.8h}, x2, x3
183
     st1             {v0.8h}, x0, x1
184
 .endr
185
     sub             w12, w12, #1
186
-    cbnz            w12, .loop_16x\h
187
+    cbnz            w12, .Loop_16x\h
188
     ret
189
 endfunc
190
 .endm
191
@@ -651,38 +651,38 @@
192
 function PFX(blockcopy_pp_12x32_neon)
193
     sub             x1, x1, #8
194
     mov             w12, #4
195
-.loop_pp_12x32:
196
+.Loop_pp_12x32:
197
     sub             w12, w12, #1
198
 .rept 8
199
     ld1             {v0.16b}, x2, x3
200
     str             d0, x0, #8
201
x265_4.0.tar.gz/source/common/aarch64/dct-prim-sve.cpp Added
201
 
1
@@ -0,0 +1,491 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *          Jonathan Wright <jonathan.wright@arm.com>
7
+ *
8
+ * This program is free software; you can redistribute it and/or modify
9
+ * it under the terms of the GNU General Public License as published by
10
+ * the Free Software Foundation; either version 2 of the License, or
11
+ * (at your option) any later version.
12
+ *
13
+ * This program is distributed in the hope that it will be useful,
14
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ * GNU General Public License for more details.
17
+ *
18
+ * You should have received a copy of the GNU General Public License
19
+ * along with this program; if not, write to the Free Software
20
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21
+ *
22
+ * This program is also available under a commercial proprietary license.
23
+ * For more information, contact us at license @ x265.com.
24
+ *****************************************************************************/
25
+
26
+#include "dct-prim.h"
27
+#include "neon-sve-bridge.h"
28
+#include <arm_neon.h>
29
+
30
+
31
+namespace
32
+{
33
+using namespace X265_NS;
34
+
35
+// First four elements (duplicated) of rows 1, 3, 5 and 7 in g_t8 (8x8 DCT
36
+// matrix.)
37
+const int16_t t8_odd[4][8] =
38
+{
39
+    { 89,  75,  50,  18, 89,  75,  50,  18 },
40
+    { 75, -18, -89, -50, 75, -18, -89, -50 },
41
+    { 50, -89,  18,  75, 50, -89,  18,  75 },
42
+    { 18, -50,  75, -89, 18, -50,  75, -89 },
43
+};
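+// Duplicating the four coefficients across both halves of each row lets a
+// single 64-bit dot product (x265_sdotq_s16) accumulate the odd-part result
+// for two output lines at a time.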
44
+
45
+template<int shift>
46
+static inline void partialButterfly8_sve(const int16_t *src, int16_t *dst)
47
+{
48
+    const int line = 8;
49
+
50
+    int16x8_t O[line / 2];
51
+    int32x4_t EE[line / 2];
52
+    int32x4_t EO[line / 2];
53
+
54
+    for (int i = 0; i < line; i += 2)
55
+    {
56
+        int16x8_t s_lo = vcombine_s16(vld1_s16(src + i * line),
57
+                                      vld1_s16(src + (i + 1) * line));
58
+        int16x8_t s_hi = vcombine_s16(
59
+            vrev64_s16(vld1_s16(src + i * line + 4)),
60
+            vrev64_s16(vld1_s16(src + (i + 1) * line + 4)));
61
+
62
+        int32x4_t E0 = vaddl_s16(vget_low_s16(s_lo), vget_low_s16(s_hi));
63
+        int32x4_t E1 = vaddl_s16(vget_high_s16(s_lo), vget_high_s16(s_hi));
64
+
65
+        O[i / 2] = vsubq_s16(s_lo, s_hi);
66
+
67
+        int32x4_t t0 = vreinterpretq_s32_s64(
68
+            vzip1q_s64(vreinterpretq_s64_s32(E0), vreinterpretq_s64_s32(E1)));
69
+        int32x4_t t1 = vrev64q_s32(vreinterpretq_s32_s64(
70
+            vzip2q_s64(vreinterpretq_s64_s32(E0), vreinterpretq_s64_s32(E1))));
71
+
72
+        EE[i / 2] = vaddq_s32(t0, t1);
73
+        EO[i / 2] = vsubq_s32(t0, t1);
74
+    }
75
+
76
+    int16_t *d = dst;
77
+
78
+    int32x4_t c0 = vld1q_s32(t8_even[0]);
79
+    int32x4_t c2 = vld1q_s32(t8_even[1]);
80
+    int32x4_t c4 = vld1q_s32(t8_even[2]);
81
+    int32x4_t c6 = vld1q_s32(t8_even[3]);
82
+    int16x8_t c1 = vld1q_s16(t8_odd[0]);
83
+    int16x8_t c3 = vld1q_s16(t8_odd[1]);
84
+    int16x8_t c5 = vld1q_s16(t8_odd[2]);
85
+    int16x8_t c7 = vld1q_s16(t8_odd[3]);
86
+
87
+    for (int j = 0; j < line; j += 4)
88
+    {
89
+        // O
90
+        int64x2_t t01 = x265_sdotq_s16(vdupq_n_s64(0), O[j / 2 + 0], c1);
91
+        int64x2_t t23 = x265_sdotq_s16(vdupq_n_s64(0), O[j / 2 + 1], c1);
92
+        int32x4_t t0123 = vcombine_s32(vmovn_s64(t01), vmovn_s64(t23));
93
+        int16x4_t res1 = vrshrn_n_s32(t0123, shift);
94
+        vst1_s16(d + 1 * line, res1);
95
+
96
+        t01 = x265_sdotq_s16(vdupq_n_s64(0), O[j / 2 + 0], c3);
97
+        t23 = x265_sdotq_s16(vdupq_n_s64(0), O[j / 2 + 1], c3);
98
+        t0123 = vcombine_s32(vmovn_s64(t01), vmovn_s64(t23));
99
+        int16x4_t res3 = vrshrn_n_s32(t0123, shift);
100
+        vst1_s16(d + 3 * line, res3);
101
+
102
+        t01 = x265_sdotq_s16(vdupq_n_s64(0), O[j / 2 + 0], c5);
103
+        t23 = x265_sdotq_s16(vdupq_n_s64(0), O[j / 2 + 1], c5);
104
+        t0123 = vcombine_s32(vmovn_s64(t01), vmovn_s64(t23));
105
+        int16x4_t res5 = vrshrn_n_s32(t0123, shift);
106
+        vst1_s16(d + 5 * line, res5);
107
+
108
+        t01 = x265_sdotq_s16(vdupq_n_s64(0), O[j / 2 + 0], c7);
109
+        t23 = x265_sdotq_s16(vdupq_n_s64(0), O[j / 2 + 1], c7);
110
+        t0123 = vcombine_s32(vmovn_s64(t01), vmovn_s64(t23));
111
+        int16x4_t res7 = vrshrn_n_s32(t0123, shift);
112
+        vst1_s16(d + 7 * line, res7);
113
+
114
+        // EE and EO
115
+        int32x4_t t0 = vpaddq_s32(EE[j / 2 + 0], EE[j / 2 + 1]);
116
+        int32x4_t t1 = vmulq_s32(c0, t0);
117
+        int16x4_t res0 = vrshrn_n_s32(t1, shift);
118
+        vst1_s16(d + 0 * line, res0);
119
+
120
+        int32x4_t t2 = vmulq_s32(c2, EO[j / 2 + 0]);
121
+        int32x4_t t3 = vmulq_s32(c2, EO[j / 2 + 1]);
122
+        int16x4_t res2 = vrshrn_n_s32(vpaddq_s32(t2, t3), shift);
123
+        vst1_s16(d + 2 * line, res2);
124
+
125
+        int32x4_t t4 = vmulq_s32(c4, EE[j / 2 + 0]);
126
+        int32x4_t t5 = vmulq_s32(c4, EE[j / 2 + 1]);
127
+        int16x4_t res4 = vrshrn_n_s32(vpaddq_s32(t4, t5), shift);
128
+        vst1_s16(d + 4 * line, res4);
129
+
130
+        int32x4_t t6 = vmulq_s32(c6, EO[j / 2 + 0]);
131
+        int32x4_t t7 = vmulq_s32(c6, EO[j / 2 + 1]);
132
+        int16x4_t res6 = vrshrn_n_s32(vpaddq_s32(t6, t7), shift);
133
+        vst1_s16(d + 6 * line, res6);
134
+
135
+        d += 4;
136
+    }
137
+}
138
+
139
+template<int shift>
140
+static inline void partialButterfly16_sve(const int16_t *src, int16_t *dst)
141
+{
142
+    const int line = 16;
143
+
144
+    int16x8_t O[line];
145
+    int16x8_t EO[line / 2];
146
+    int32x4_t EEE[line];
147
+    int32x4_t EEO[line];
148
+
149
+    for (int i = 0; i < line; i += 2)
150
+    {
151
+        int16x8_t s0_lo = vld1q_s16(src + i * line);
152
+        int16x8_t s0_hi = rev16(vld1q_s16(src + i * line + 8));
153
+
154
+        int16x8_t s1_lo = vld1q_s16(src + (i + 1) * line);
155
+        int16x8_t s1_hi = rev16(vld1q_s16(src + (i + 1) * line + 8));
156
+
157
+        int32x4_t E0[2];
158
+        E0[0] = vaddl_s16(vget_low_s16(s0_lo), vget_low_s16(s0_hi));
159
+        E0[1] = vaddl_s16(vget_high_s16(s0_lo), vget_high_s16(s0_hi));
160
+
161
+        int32x4_t E1[2];
162
+        E1[0] = vaddl_s16(vget_low_s16(s1_lo), vget_low_s16(s1_hi));
163
+        E1[1] = vaddl_s16(vget_high_s16(s1_lo), vget_high_s16(s1_hi));
164
+
165
+        O[i + 0] = vsubq_s16(s0_lo, s0_hi);
166
+        O[i + 1] = vsubq_s16(s1_lo, s1_hi);
167
+
168
+        int16x4_t EO_lo = vmovn_s32(vsubq_s32(E0[0], rev32(E0[1])));
169
+        int16x4_t EO_hi = vmovn_s32(vsubq_s32(E1[0], rev32(E1[1])));
170
+        EO[i / 2] = vcombine_s16(EO_lo, EO_hi);
171
+
172
+        int32x4_t EE0 = vaddq_s32(E0[0], rev32(E0[1]));
173
+        int32x4_t EE1 = vaddq_s32(E1[0], rev32(E1[1]));
174
+
175
+        int32x4_t t0 = vreinterpretq_s32_s64(
176
+            vzip1q_s64(vreinterpretq_s64_s32(EE0), vreinterpretq_s64_s32(EE1)));
177
+        int32x4_t t1 = vrev64q_s32(vreinterpretq_s32_s64(
178
+            vzip2q_s64(vreinterpretq_s64_s32(EE0),
179
+                       vreinterpretq_s64_s32(EE1))));
180
+
181
+        EEE[i / 2] = vaddq_s32(t0, t1);
182
+        EEO[i / 2] = vsubq_s32(t0, t1);
183
+    }
184
+
185
+    for (int i = 0; i < line; i += 4)
186
+    {
187
+        for (int k = 1; k < 16; k += 2)
188
+        {
189
+            int16x8_t c0_c4 = vld1q_s16(&g_t16[k][0]);
190
+
191
+            int64x2_t t0 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, O[i + 0]);
192
+            int64x2_t t1 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, O[i + 1]);
193
+            int64x2_t t2 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, O[i + 2]);
194
+            int64x2_t t3 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, O[i + 3]);
195
+
196
+            int32x4_t t01 = vcombine_s32(vmovn_s64(t0), vmovn_s64(t1));
197
+            int32x4_t t23 = vcombine_s32(vmovn_s64(t2), vmovn_s64(t3));
198
+            int16x4_t res = vrshrn_n_s32(vpaddq_s32(t01, t23), shift);
199
+            vst1_s16(dst + k * line, res);
200
+        }
201
x265_3.6.tar.gz/source/common/aarch64/dct-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/dct-prim.cpp Changed
201
 
1
@@ -5,36 +5,35 @@
2
 
3
 #include <arm_neon.h>
4
 
5
+#define X265_PRAGMA(text)       _Pragma(#text)
6
+#if defined(__clang__)
7
+#define X265_PRAGMA_UNROLL(n)   X265_PRAGMA(unroll(n))
8
+#elif defined(__GNUC__)
9
+#define X265_PRAGMA_UNROLL(n)   X265_PRAGMA(GCC unroll (n))
10
+#else
11
+#define X265_PRAGMA_UNROLL(n)
12
+#endif
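+// Assumed usage: X265_PRAGMA_UNROLL(4) placed before a loop expands to
+// _Pragma("unroll(4)") under Clang and _Pragma("GCC unroll (4)") under GCC,
+// and to nothing for other compilers.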
13
+
14
+extern "C" void PFX(dct16_neon)(const int16_t *src, int16_t *dst, intptr_t srcStride);
15
+extern "C" void PFX(idct16_neon)(const int16_t *src, int16_t *dst, intptr_t dstStride);
16
 
17
 namespace
18
 {
19
 using namespace X265_NS;
20
 
21
-
22
-static int16x8_t rev16(const int16x8_t a)
23
+static void transpose_4x4x16(int16x4_t &x0, int16x4_t &x1, int16x4_t &x2, int16x4_t &x3)
24
 {
25
-    static const int8x16_t tbl = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
26
-    return vqtbx1q_u8(a, a, tbl);
27
-}
28
+    int32x2_t s0, s1, s2, s3;
29
 
30
-static int32x4_t rev32(const int32x4_t a)
31
-{
32
-    static const int8x16_t tbl = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
33
-    return vqtbx1q_u8(a, a, tbl);
34
-}
35
+    s0 = vtrn1_s32(vreinterpret_s32_s16(x0), vreinterpret_s32_s16(x2));
36
+    s1 = vtrn1_s32(vreinterpret_s32_s16(x1), vreinterpret_s32_s16(x3));
37
+    s2 = vtrn2_s32(vreinterpret_s32_s16(x0), vreinterpret_s32_s16(x2));
38
+    s3 = vtrn2_s32(vreinterpret_s32_s16(x1), vreinterpret_s32_s16(x3));
39
 
40
-static void transpose_4x4x16(int16x4_t &x0, int16x4_t &x1, int16x4_t &x2, int16x4_t &x3)
41
-{
42
-    int16x4_t s0, s1, s2, s3;
43
-    s0 = vtrn1_s32(x0, x2);
44
-    s1 = vtrn1_s32(x1, x3);
45
-    s2 = vtrn2_s32(x0, x2);
46
-    s3 = vtrn2_s32(x1, x3);
47
-
48
-    x0 = vtrn1_s16(s0, s1);
49
-    x1 = vtrn2_s16(s0, s1);
50
-    x2 = vtrn1_s16(s2, s3);
51
-    x3 = vtrn2_s16(s2, s3);
52
+    x0 = vtrn1_s16(vreinterpret_s16_s32(s0), vreinterpret_s16_s32(s1));
53
+    x1 = vtrn2_s16(vreinterpret_s16_s32(s0), vreinterpret_s16_s32(s1));
54
+    x2 = vtrn1_s16(vreinterpret_s16_s32(s2), vreinterpret_s16_s32(s3));
55
+    x3 = vtrn2_s16(vreinterpret_s16_s32(s2), vreinterpret_s16_s32(s3));
56
 }
57
 
58
 
59
@@ -111,13 +110,13 @@
60
     int64x2_t vcost_sum_1 = vdupq_n_s64(0);
61
     for (int y = 0; y < MLS_CG_SIZE; y++)
62
     {
63
-        int16x4_t in = *(int16x4_t *)&m_resiDctCoeff[blkPos];
64
+        int16x4_t in = vld1_s16(&m_resiDctCoeff[blkPos]);
65
         int32x4_t mul = vmull_s16(in, in);
66
         int64x2_t cost0, cost1;
67
         cost0 = vshll_n_s32(vget_low_s32(mul), scaleBits);
68
         cost1 = vshll_high_n_s32(mul, scaleBits);
69
-        *(int64x2_t *)&costUncoded[blkPos + 0] = cost0;
70
-        *(int64x2_t *)&costUncoded[blkPos + 2] = cost1;
71
+        vst1q_s64(&costUncoded[blkPos + 0], cost0);
72
+        vst1q_s64(&costUncoded[blkPos + 2], cost1);
73
         vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
74
         vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
75
         blkPos += trSize;
76
@@ -143,8 +142,9 @@
77
     int32x4_t vpsy = vdupq_n_s32(*psyScale);
78
     for (int y = 0; y < MLS_CG_SIZE; y++)
79
     {
80
-        int32x4_t signCoef = vmovl_s16(*(int16x4_t *)&m_resiDctCoeff[blkPos]);
81
-        int32x4_t predictedCoef = vsubq_s32(vmovl_s16(*(int16x4_t *)&m_fencDctCoeff[blkPos]), signCoef);
82
+        int32x4_t signCoef = vmovl_s16(vld1_s16(&m_resiDctCoeff[blkPos]));
83
+        int32x4_t fencCoef = vmovl_s16(vld1_s16(&m_fencDctCoeff[blkPos]));
84
+        int32x4_t predictedCoef = vsubq_s32(fencCoef, signCoef);
85
         int64x2_t cost0, cost1;
86
         cost0 = vmull_s32(vget_low_s32(signCoef), vget_low_s32(signCoef));
87
         cost1 = vmull_high_s32(signCoef, signCoef);
88
@@ -160,8 +160,8 @@
89
         }
90
         cost0 = vsubq_s64(cost0, neg0);
91
         cost1 = vsubq_s64(cost1, neg1);
92
-        *(int64x2_t *)&costUncoded[blkPos + 0] = cost0;
93
-        *(int64x2_t *)&costUncoded[blkPos + 2] = cost1;
94
+        vst1q_s64(&costUncoded[blkPos + 0], cost0);
95
+        vst1q_s64(&costUncoded[blkPos + 2], cost1);
96
         vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
97
         vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
98
 
99
@@ -188,8 +188,9 @@
100
     int i = 0;
101
     for (; (i + 8) <= numCoeff; i += 8)
102
     {
103
-        int16x8_t in = *(int16x8_t *)&quantCoeff[i];
104
-        vcount = vaddq_s16(vcount, vtstq_s16(in, in));
105
+        int16x8_t in = vld1q_s16(&quantCoeff[i]);
106
+        uint16x8_t tst = vtstq_s16(in, in);
107
+        vcount = vaddq_s16(vcount, vreinterpretq_s16_u16(tst));
108
     }
109
     for (; i < numCoeff; i++)
110
     {
111
@@ -209,9 +210,10 @@
112
         int j = 0;
113
         for (; (j + 8) <= trSize; j += 8)
114
         {
115
-            int16x8_t in = *(int16x8_t *)&residual[j];
116
-            *(int16x8_t *)&coeff[j] = in;
117
-            vcount = vaddq_s16(vcount, vtstq_s16(in, in));
118
+            int16x8_t in = vld1q_s16(&residual[j]);
119
+            vst1q_s16(&coeff[j], in);
120
+            uint16x8_t tst = vtstq_s16(in, in);
121
+            vcount = vaddq_s16(vcount, vreinterpretq_s16_u16(tst));
122
         }
123
         for (; j < trSize; j++)
124
         {
125
@@ -225,200 +227,396 @@
126
     return numSig - vaddvq_s16(vcount);
127
 }
128
 
129
-
130
-static void partialButterfly16(const int16_t *src, int16_t *dst, int shift, int line)
131
+template<int shift>
132
+static inline void partialButterfly16_neon(const int16_t *src, int16_t *dst)
133
 {
134
-    int j, k;
135
-    int32x4_t E2, O2;
136
-    int32x4_t EE, EO;
137
-    int32x2_t EEE, EEO;
138
-    const int add = 1 << (shift - 1);
139
-    const int32x4_t _vadd = {add, 0};
140
+    const int line = 16;
141
 
142
-    for (j = 0; j < line; j++)
143
+    int16x8_t Oline;
144
+    int32x4_t EOline;
145
+    int32x4_t EEEline;
146
+    int32x4_t EEOline;
147
+
148
+    for (int i = 0; i < line; i += 2)
149
     {
150
-        int16x8_t in0 = *(int16x8_t *)src;
151
-        int16x8_t in1 = rev16(*(int16x8_t *)&src8);
152
+        int16x8_t s0_lo = vld1q_s16(src + i * line);
153
+        int16x8_t s0_hi = rev16(vld1q_s16(src + i * line + 8));
154
 
155
-        E0 = vaddl_s16(vget_low_s16(in0), vget_low_s16(in1));
156
-        O0 = vsubl_s16(vget_low_s16(in0), vget_low_s16(in1));
157
-        E1 = vaddl_high_s16(in0, in1);
158
-        O1 = vsubl_high_s16(in0, in1);
159
+        int16x8_t s1_lo = vld1q_s16(src + (i + 1) * line);
160
+        int16x8_t s1_hi = rev16(vld1q_s16(src + (i + 1) * line + 8));
161
 
162
-        for (k = 1; k < 16; k += 2)
163
-        {
164
-            int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t16k0);
165
-            int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t16k4);
166
+        int32x4_t E02;
167
+        E00 = vaddl_s16(vget_low_s16(s0_lo), vget_low_s16(s0_hi));
168
+        E01 = vaddl_s16(vget_high_s16(s0_lo), vget_high_s16(s0_hi));
169
 
170
-            int32x4_t res = _vadd;
171
-            res = vmlaq_s32(res, c0, O0);
172
-            res = vmlaq_s32(res, c1, O1);
173
-            dstk * line = (int16_t)(vaddvq_s32(res) >> shift);
174
-        }
175
+        int32x4_t E12;
176
+        E10 = vaddl_s16(vget_low_s16(s1_lo), vget_low_s16(s1_hi));
177
+        E11 = vaddl_s16(vget_high_s16(s1_lo), vget_high_s16(s1_hi));
178
+
179
+        Oi + 0 = vsubq_s16(s0_lo, s0_hi);
180
+        Oi + 1 = vsubq_s16(s1_lo, s1_hi);
181
+
182
+        int32x4_t EE0 = vaddq_s32(E00, rev32(E01));
183
+        int32x4_t EE1 = vaddq_s32(E10, rev32(E11));
184
+        EOi + 0 = vsubq_s32(E00, rev32(E01));
185
+        EOi + 1 = vsubq_s32(E10, rev32(E11));
186
+
187
+        int32x4_t t0 = vreinterpretq_s32_s64(
188
+            vzip1q_s64(vreinterpretq_s64_s32(EE0), vreinterpretq_s64_s32(EE1)));
189
+        int32x4_t t1 = vrev64q_s32(vreinterpretq_s32_s64(vzip2q_s64(
190
+            vreinterpretq_s64_s32(EE0), vreinterpretq_s64_s32(EE1))));
191
 
192
-        /* EE and EO */
193
-        EE = vaddq_s32(E0, rev32(E1));
194
-        EO = vsubq_s32(E0, rev32(E1));
195
 
196
-        for (k = 2; k < 16; k += 4)
197
+        EEEi / 2 = vaddq_s32(t0, t1);
198
+        EEOi / 2 = vsubq_s32(t0, t1);
199
+    }
200
+
201
x265_3.6.tar.gz/source/common/aarch64/dct-prim.h -> x265_4.0.tar.gz/source/common/aarch64/dct-prim.h Changed
53
 
1
@@ -6,11 +6,51 @@
2
 #include "primitives.h"
3
 #include "contexts.h"   // costCoeffNxN_c
4
 #include "threading.h"  // CLZ
5
+#include <arm_neon.h>
6
 
7
 namespace X265_NS
8
 {
9
+// First two columns of the 4x4 dct transform matrix, duplicated to 4x4 to allow
10
+// processing two lines at once.
11
+const int32_t t8_even[4][4] =
12
+{
13
+    { 64,  64, 64,  64 },
14
+    { 83,  36, 83,  36 },
15
+    { 64, -64, 64, -64 },
16
+    { 36, -83, 36, -83 },
17
+};
18
+
19
+const uint8_t rev16_tbl[16] =
20
+{
21
+    14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
22
+};
23
+
24
+const uint8_t rev32_tbl[16] =
25
+{
26
+    12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
27
+};
28
+
29
+static inline int16x8_t rev16(const int16x8_t a)
30
+{
31
+    const uint8x16_t tbl = vld1q_u8(rev16_tbl);
32
+    const int8x16_t a_s8 = vreinterpretq_s8_s16(a);
33
+
34
+    return vreinterpretq_s16_s8(vqtbx1q_s8(a_s8, a_s8, tbl));
35
+}
36
+
37
+static inline int32x4_t rev32(const int32x4_t a)
38
+{
39
+    const uint8x16_t tbl = vld1q_u8(rev32_tbl);
40
+    const int8x16_t a_s8 = vreinterpretq_s8_s32(a);
41
+
42
+    return vreinterpretq_s32_s8(vqtbx1q_s8(a_s8, a_s8, tbl));
43
+}
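+// rev16 reverses the order of the eight 16-bit lanes of a vector (e.g.
+// {0,1,2,3,4,5,6,7} becomes {7,6,5,4,3,2,1,0}); rev32 does the same for the
+// four 32-bit lanes. Both serve to mirror the second half of a DCT input row
+// before the butterfly add/subtract.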
44
+
45
 // x265 private namespace
46
 void setupDCTPrimitives_neon(EncoderPrimitives &p);
47
+#if defined(HAVE_SVE) && HAVE_SVE_BRIDGE
48
+void setupDCTPrimitives_sve(EncoderPrimitives &p);
49
+#endif
50
 };
51
 
52
 
53
x265_4.0.tar.gz/source/common/aarch64/dct.S Added
201
 
1
@@ -0,0 +1,883 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Min Chen <min.chen@multicorewareinc.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// Functions in this file:
26
+// ***** luma_vpp *****
27
+
28
+#include "asm.S"
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+.set idct16_shift_1, 7
40
+.set idct16_shift_2, 12-(BIT_DEPTH-8)
41
+
42
+.set dct16_shift_1, 3+(BIT_DEPTH-8)
43
+.set dct16_shift_2, 10
44
+
45
+.align 4
46
+// NOTE: Hardcoded due to asm syntax issue, don't reorder!
47
+tbl_const_idct_0:
48
+    .hword 64, 83, 36, 89, 75, 50, 18,  0   // v0
49
+    .hword 90, 87, 80, 70, 57, 43, 25,  9   // v1
50
+//    .hword 0=64, 1=83, 2=36, 3=89, 4=75, 5=50, 6=18, 7=00
51
+//    .hword 0=90, 1=87, 2=80, 3=70, 4=57, 5=43, 6=25, 7= 9
52
+
53
+    .hword 64, 83, 64, 36   // v0
54
+    .hword 64, 36,-64,-83
55
+    .hword 64,-36,-64, 83   // v1
56
+    .hword 64,-83, 64,-36
57
+
58
+    .hword 89, 75, 50, 18   // v2
59
+    .hword 75,-18,-89,-50
60
+    .hword 50,-89, 18, 75   // v3
61
+    .hword 18,-50, 75,-89
62
+
63
+    .hword 90,+87,+80,+70, +57,+43,+25,+ 9   // v4
64
+    .hword 87,+57, +9,-43, -80,-90,-70,-25   // v5
65
+    .hword 80, +9,-70,-87, -25,+57,+90,+43   // v6
66
+    .hword 70,-43,-87, +9, +90,+25,-80,-57   // v7
67
+    .hword 57,-80,-25,+90, - 9,-87,+43,+70   // v8
68
+    .hword 43,-90,+57,+25, -87,+70,+ 9,-80   // v9
69
+    .hword 25,-70,+90,-80, +43,+ 9,-57,+87   // v16
70
+    .hword  9,-25,+43,-57, +70,-80,+87,-90   // v17
71
+
72
+    .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3  // v18
73
+
74
+tbl_const_dct_0:
75
+    // EE
76
+    .hword 64,+64,+64,+64                   // v16
77
+    .hword 83,+36,-36,-83                   // v17
78
+    .hword 64,-64,-64,+64                   // v18
79
+    .hword 36,-83,+83,-36                   // v19
80
+
81
+    // EO
82
+    .hword 89,+75,+50,+18                   // v20
83
+    .hword 75,-18,-89,-50                   // v21
84
+    .hword 50,-89,+18,+75                   // v22
85
+    .hword 18,-50,+75,-89                   // v23
86
+
87
+    // O
88
+    .hword 90,+87,+80,+70,+57,+43,+25, +9   // v24
89
+    .hword 87,+57, +9,-43,-80,-90,-70,-25   // v25
90
+    .hword 80, +9,-70,-87,-25,+57,+90,+43   // v26
91
+    .hword 70,-43,-87, +9,+90,+25,-80,-57   // v27
92
+    .hword 57,-80,-25,+90, -9,-87,+43,+70   // v28
93
+    .hword 43,-90,+57,+25,-87,+70, +9,-80   // v29
94
+    .hword 25,-70,+90,-80,+43, +9,-57,+87   // v30
95
+    .hword  9,-25,+43,-57,+70,-80,+87,-90   // v31
96
+
97
+    .byte 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1  // v0
98
+//    .byte 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9  // v1
99
+
100
+    .word 64, 83, 36, 89, 75, 50, 18,  0    // v0, v1
101
+    .word 90, 87, 80, 70, 57, 43, 25,  9    // v2, v3
102
+
103
+
104
+// ***** idct 16x16 *****
105
+// void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
106
+function PFX(idct16_neon)
107
+// Register map
108
+// x0  = src
109
+// x1  = dst
110
+// x2  = dstStride
111
+// x8  = tbl_const_idct_0
112
+
113
+    stp             d8, d9, sp,#-16!
114
+    sub             sp, sp, #(16*16*2)
115
+
116
+    adr             x8, tbl_const_idct_0
117
+    ldp             q0, q1, x8
118
+
119
+    mov             x5, sp
120
+    mov             w4, #16
121
+
122
+    // Pass1
123
+5:
124
+    ldr             d16, x0, #(0*16*2)
125
+    ldr             d17, x0, #(2*16*2)
126
+    ldr             d18, x0, #(4*16*2)
127
+    ldr             d19, x0, #(6*16*2)
128
+    ldr             d20, x0, #(8*16*2)
129
+    ldr             d21, x0, #(10*16*2)
130
+    ldr             d22, x0, #(12*16*2)
131
+    ldr             d23, x0, #(14*16*2)
132
+
133
+// EEE0 = 64*src0*16+i + 64*src 8*16+i;
134
+// EEE1 = 64*src0*16+i - 64*src 8*16+i;
135
+// EEO0 = 83*src4*16+i + 36*src12*16+i;
136
+// EEO1 = 36*src4*16+i - 83*src12*16+i;
137
+    smull           v24.4s, v16.4h, v0.h0         // EEE0 = 64*0
138
+    smull           v26.4s, v18.4h, v0.h1         // EEO0 = 83*4
139
+    mov             v25.16b, v24.16b                // EEE1 = 64*0
140
+    smull           v27.4s, v18.4h, v0.h2         // EEO1 = 36*4
141
+
142
+// EO0 = 89*src 2*16+i + 75*src 6*16+i + 50*src10*16+i + 18*src14*16+i;
143
+// EO1 = 75*src 2*16+i - 18*src 6*16+i - 89*src10*16+i - 50*src14*16+i;
144
+// EO2 = 50*src 2*16+i - 89*src 6*16+i + 18*src10*16+i + 75*src14*16+i;
145
+// EO3 = 18*src 2*16+i - 50*src 6*16+i + 75*src10*16+i - 89*src14*16+i;
146
+    smull           v28.4s, v17.4h, v0.h3         // EO0 = 89*2
147
+    smull           v29.4s, v17.4h, v0.h4         // EO1 = 75*2
148
+    smull           v30.4s, v17.4h, v0.h5         // EO2 = 50*2
149
+    smull           v31.4s, v17.4h, v0.h6         // EO3 = 18*2
150
+
151
+    smlal           v28.4s, v19.4h, v0.h4         // EO0 = 89*2+75*6
152
+    smlsl           v29.4s, v19.4h, v0.h6         // EO1 = 75*2-18*6
153
+    smlsl           v30.4s, v19.4h, v0.h3         // EO2 = 50*2-89*6
154
+    smlsl           v31.4s, v19.4h, v0.h5         // EO3 = 18*2-50*6
155
+
156
+    ldr             d16, x0, #(1*16*2)
157
+    ldr             d17, x0, #(3*16*2)
158
+    ldr             d18, x0, #(5*16*2)
159
+    ldr             d19, x0, #(7*16*2)
160
+
161
+    orr             v2.8b, v20.8b, v21.8b
162
+    orr             v2.8b, v2.8b, v22.8b
163
+    orr             v2.8b, v2.8b, v23.8b
164
+    orr             v3.8b, v18.8b, v19.8b
165
+    mov             x6, v2.d0
166
+    mov             x7, v3.d0
167
+
168
+// O0 = 90*src 1*16+i + 87*src 3*16+i + 80*src 5*16+i + 70*src 7*16+i + 57*src 9*16+i + 43*src11*16+i + 25*src13*16+i +  9*src15*16+i;
169
+// O1 = 87*src 1*16+i + 57*src 3*16+i +  9*src 5*16+i - 43*src 7*16+i - 80*src 9*16+i - 90*src11*16+i - 70*src13*16+i - 25*src15*16+i;
170
+// O2 = 80*src 1*16+i +  9*src 3*16+i - 70*src 5*16+i - 87*src 7*16+i - 25*src 9*16+i + 57*src11*16+i + 90*src13*16+i + 43*src15*16+i;
171
+// O3 = 70*src 1*16+i - 43*src 3*16+i - 87*src 5*16+i +  9*src 7*16+i + 90*src 9*16+i + 25*src11*16+i - 80*src13*16+i - 57*src15*16+i;
172
+// O4 = 57*src 1*16+i - 80*src 3*16+i - 25*src 5*16+i + 90*src 7*16+i -  9*src 9*16+i - 87*src11*16+i + 43*src13*16+i + 70*src15*16+i;
173
+// O5 = 43*src 1*16+i - 90*src 3*16+i + 57*src 5*16+i + 25*src 7*16+i - 87*src 9*16+i + 70*src11*16+i +  9*src13*16+i - 80*src15*16+i;
174
+// O6 = 25*src 1*16+i - 70*src 3*16+i + 90*src 5*16+i - 80*src 7*16+i + 43*src 9*16+i +  9*src11*16+i - 57*src13*16+i + 87*src15*16+i;
175
+// O7 =  9*src 1*16+i - 25*src 3*16+i + 43*src 5*16+i - 57*src 7*16+i + 70*src 9*16+i - 80*src11*16+i + 87*src13*16+i - 90*src15*16+i;
176
+    smull           v2.4s, v16.4h, v1.h0          // v2 = O0 = 90*1
177
+    smull           v3.4s, v16.4h, v1.h1          // v3 = O1 = 87*1
178
+    smull           v4.4s, v16.4h, v1.h2          // v4 = O2 = 80*1
179
+    smull           v5.4s, v16.4h, v1.h3          // v5 = O3 = 70*1
180
+    smull           v6.4s, v16.4h, v1.h4          // v6 = O4 = 57*1
181
+    smull           v7.4s, v16.4h, v1.h5          // v7 = O5 = 43*1
182
+    smull           v8.4s, v16.4h, v1.h6          // v8 = O6 = 25*1
183
+    smull           v9.4s, v16.4h, v1.h7          // v9 = O7 =  9*1
184
+
185
+    smlal           v2.4s, v17.4h, v1.h1          // v2 = O0 = 90*1+87*3
186
+    smlal           v3.4s, v17.4h, v1.h4          // v3 = O1 = 87*1+57*3
187
+    smlal           v4.4s, v17.4h, v1.h7          // v4 = O2 = 80*1+ 9*3
188
+    smlsl           v5.4s, v17.4h, v1.h5          // v5 = O3 = 70*1-43*3
189
+    smlsl           v6.4s, v17.4h, v1.h2          // v6 = O4 = 57*1-80*3
190
+    smlsl           v7.4s, v17.4h, v1.h0          // v7 = O5 = 43*1-90*3
191
+    smlsl           v8.4s, v17.4h, v1.h3          // v8 = O6 = 25*1-70*3
192
+    smlsl           v9.4s, v17.4h, v1.h6          // v9 = O7 =  9*1-25*3
193
+
194
+    //cmp             x7, #0
195
+    //beq             1f
196
+    cbz             x7, 1f
197
+
198
+    smlal           v2.4s, v18.4h, v1.h2          // v2 = O0 = 90*1+87*3+80*5
199
+    smlal           v3.4s, v18.4h, v1.h7          // v3 = O1 = 87*1+57*3+ 9*5
200
+    smlsl           v4.4s, v18.4h, v1.h3          // v4 = O2 = 80*1+ 9*3-70*5
201
x265_4.0.tar.gz/source/common/aarch64/filter-neon-dotprod.cpp Added
201
 
1
@@ -0,0 +1,1131 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "filter-neon-dotprod.h"
26
+
27
+#if !HIGH_BIT_DEPTH
28
+#include "mem-neon.h"
29
+#include <arm_neon.h>
30
+
31
+namespace {
32
+static const uint8_t dotprod_permute_tbl[48] = {
33
+    0, 1,  2,  3, 1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5, 6,
34
+    4, 5,  6,  7, 5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10,
35
+    8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
36
+};
37
+
38
+static const uint8_t dot_prod_merge_block_tbl[48] = {
39
+    // Shift left and insert new last column in transposed 4x4 block.
40
+    1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
41
+    // Shift left and insert two new columns in transposed 4x4 block.
42
+    2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
43
+    // Shift left and insert three new columns in transposed 4x4 block.
44
+    3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
45
+};
46
+
47
+uint8x8_t inline filter8_8_pp(uint8x16_t samples, const int8x8_t filter,
48
+                              const int32x4_t constant, const uint8x16x3_t tbl)
49
+{
50
+    // Transform sample range from uint8_t to int8_t for signed dot product.
51
+    int8x16_t samples_s8 =
52
+        vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
53
+
54
+    // Permute input samples for dot product.
55
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
56
+    int8x16_t perm_samples_0 = vqtbl1q_s8(samples_s8, tbl.val[0]);
57
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
58
+    int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val[1]);
59
+    // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
60
+    int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val[2]);
61
+
62
+    int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
63
+    int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0);
64
+    dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1);
65
+    dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1);
66
+
67
+    // Narrow and combine.
68
+    int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
69
+                                     vmovn_s32(dotprod_hi));
70
+    return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
71
+}
72
+
73
+void inline init_sample_permute(uint8x8_t *samples, const uint8x16x3_t tbl,
74
+                                int8x16_t *d)
75
+{
76
+    // Transform sample range from uint8_t to int8_t for signed dot product.
77
+    int8x8_t samples_s8[4];
78
+    samples_s8[0] = vreinterpret_s8_u8(vsub_u8(samples[0], vdup_n_u8(128)));
79
+    samples_s8[1] = vreinterpret_s8_u8(vsub_u8(samples[1], vdup_n_u8(128)));
80
+    samples_s8[2] = vreinterpret_s8_u8(vsub_u8(samples[2], vdup_n_u8(128)));
81
+    samples_s8[3] = vreinterpret_s8_u8(vsub_u8(samples[3], vdup_n_u8(128)));
82
+
83
+    // Permute input samples for dot product.
84
+    // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
85
+    d[0] = vqtbl1q_s8(vcombine_s8(samples_s8[0], vdup_n_s8(0)), tbl.val[0]);
86
+    d[1] = vqtbl1q_s8(vcombine_s8(samples_s8[1], vdup_n_s8(0)), tbl.val[0]);
87
+    d[2] = vqtbl1q_s8(vcombine_s8(samples_s8[2], vdup_n_s8(0)), tbl.val[0]);
88
+    d[3] = vqtbl1q_s8(vcombine_s8(samples_s8[3], vdup_n_s8(0)), tbl.val[0]);
89
+}
90
+
91
+uint8x8_t inline filter8_8_pp_reuse(uint8x16_t samples, const int8x8_t filter,
92
+                                    const int32x4_t constant,
93
+                                    const uint8x16x3_t tbl,
94
+                                    int8x16_t &perm_samples_0)
95
+{
96
+    // Transform sample range from uint8_t to int8_t for signed dot product.
97
+    int8x16_t samples_s8 =
98
+        vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
99
+
100
+    // Permute input samples for dot product.
101
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
102
+    // Already in perm_samples_0.
103
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
104
+    int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val[1]);
105
+    // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
106
+    int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val[2]);
107
+
108
+    int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
109
+    int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0);
110
+    dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1);
111
+    dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1);
112
+
113
+    // Save for re-use in next iteration.
114
+    perm_samples_0 = perm_samples_2;
115
+
116
+    // Narrow and combine.
117
+    int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
118
+                                     vmovn_s32(dotprod_hi));
119
+    return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
120
+}
121
+
122
+int16x4_t inline filter8_4_ps(uint8x16_t samples, const int8x8_t filter,
123
+                              const uint8x16x3_t tbl)
124
+{
125
+    // Transform sample range from uint8_t to int8_t for signed dot product.
126
+    int8x16_t samples_s8 =
127
+        vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
128
+
129
+    // Permute input samples for dot product.
130
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
131
+    int8x16_t perm_samples_0 = vqtbl1q_s8(samples_s8, tbl.val[0]);
132
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
133
+    int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val[1]);
134
+
135
+    // Correction accounting for sample range transform cancels to 0.
136
+    int32x4_t constant = vdupq_n_s32(0);
137
+    int32x4_t dotprod = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
138
+    dotprod = vdotq_lane_s32(dotprod, perm_samples_1, filter, 1);
139
+
140
+    // Narrow.
141
+    return vmovn_s32(dotprod);
142
+}
143
+
144
+int16x8_t inline filter8_8_ps(uint8x16_t samples, const int8x8_t filter,
145
+                              const uint8x16x3_t tbl)
146
+{
147
+    // Transform sample range from uint8_t to int8_t for signed dot product.
148
+    int8x16_t samples_s8 =
149
+        vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
150
+
151
+    // Permute input samples for dot product.
152
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
153
+    int8x16_t perm_samples_0 = vqtbl1q_s8(samples_s8, tbl.val[0]);
154
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
155
+    int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val[1]);
156
+    // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
157
+    int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val[2]);
158
+
159
+    // Correction accounting for sample range transform cancels to 0.
160
+    int32x4_t constant = vdupq_n_s32(0);
161
+    int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
162
+    int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0);
163
+    dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1);
164
+    dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1);
165
+
166
+    // Narrow and combine.
167
+    return vcombine_s16(vmovn_s32(dotprod_lo), vmovn_s32(dotprod_hi));
168
+}
169
+
170
+int16x8_t inline filter8_8_ps_reuse(uint8x16_t samples, const int8x8_t filter,
171
+                                    const uint8x16x3_t tbl,
172
+                                    int8x16_t &perm_samples_0)
173
+{
174
+    // Transform sample range from uint8_t to int8_t for signed dot product.
175
+    int8x16_t samples_s8 =
176
+        vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
177
+
178
+    // Permute input samples for dot product.
179
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
180
+    // Already in perm_samples_0.
181
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
182
+    int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val[1]);
183
+    // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
184
+    int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val[2]);
185
+
186
+    // Correction accounting for sample range transform cancels to 0.
187
+    int32x4_t constant = vdupq_n_s32(0);
188
+    int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
189
+    int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0);
190
+    dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1);
191
+    dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1);
192
+
193
+    // Save for re-use in next iteration.
194
+    perm_samples_0 = perm_samples_2;
195
+
196
+    // Narrow and combine.
197
+    return vcombine_s16(vmovn_s32(dotprod_lo), vmovn_s32(dotprod_hi));
198
+}
199
+
200
+uint8x8_t inline filter4_8_pp(uint8x16_t samples, const int8x8_t filter,
201
x265_4.0.tar.gz/source/common/aarch64/filter-neon-dotprod.h Added
39
 
1
@@ -0,0 +1,37 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_COMMON_AARCH64_FILTER_NEON_DOTPROD_H
26
+#define X265_COMMON_AARCH64_FILTER_NEON_DOTPROD_H
27
+
28
+#if defined(HAVE_NEON_DOTPROD)
29
+
30
+#include "primitives.h"
31
+
32
+namespace X265_NS {
33
+void setupFilterPrimitives_neon_dotprod(EncoderPrimitives &p);
34
+}
35
+
36
+#endif // defined(HAVE_NEON_DOTPROD)
37
+
38
+#endif // X265_COMMON_AARCH64_FILTER_NEON_DOTPROD_H
39
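A hedged illustration (not part of the patch): the "transform sample range from uint8_t to int8_t" step in filter-neon-dotprod.cpp exists because the Armv8.4 DotProd extension only provides same-signedness dot products (SDOT/UDOT), while the interpolation coefficients are signed. Subtracting 128 from every sample and folding the bias into one constant gives the same result, since the HEVC interpolation filters sum to 64:

    #include <stdint.h>

    static inline int32_t filter8_range_shift_scalar(const uint8_t s[8], const int8_t f[8])
    {
        int32_t acc = 128 * 64;               // cancels the -128 applied to each sample
        for (int k = 0; k < 8; k++)
            acc += (s[k] - 128) * f[k];       // what vdotq_lane_s32 accumulates
        return acc;                           // equals sum(s[k] * f[k]); the caller rounds/shifts
    }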
x265_4.0.tar.gz/source/common/aarch64/filter-neon-i8mm.cpp Added
201
 
1
@@ -0,0 +1,1412 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#if defined(HAVE_NEON_I8MM)
26
+#include "filter-neon-i8mm.h"
27
+#if !HIGH_BIT_DEPTH
28
+
29
+#include "mem-neon.h"
30
+
31
+#include <arm_neon.h>
32
+
33
+namespace {
34
+static const uint8_t dotprod_permute_tbl[48] = {
35
+    0, 1,  2,  3, 1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5, 6,
36
+    4, 5,  6,  7, 5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10,
37
+    8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
38
+};
39
+
40
+static const uint8_t matmul_permute_tbl[2][32] = {
41
+    // Permute for luma filter 3.
42
+    { 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9,
43
+      4,  5,  6,  7,  8,  9, 10, 11,  6,  7,  8,  9, 10, 11, 12, 13 },
44
+    // Permute for luma filter 1.
45
+    { 1,  2,  3,  4,  5,  6,  7,  8,  3,  4,  5,  6,  7,  8,  9, 10,
46
+      5,  6,  7,  8,  9, 10, 11, 12,  7,  8,  9, 10, 11, 12, 13, 14 }
47
+};
48
+
49
+static const int8_t matmul_luma_filter[2][16] = {
50
+    { -1, 4, -10, 58, 17, -5, 1, 0, 0, -1, 4, -10, 58, 17, -5, 1 },
51
+    { 1, -5, 17, 58, -10, 4, -1, 0, 0, 1, -5, 17, 58, -10, 4, -1 }
52
+};
53
+
54
+static const uint8_t dot_prod_merge_block_tbl[48] = {
55
+    // Shift left and insert new last column in transposed 4x4 block.
56
+    1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
57
+    // Shift left and insert two new columns in transposed 4x4 block.
58
+    2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
59
+    // Shift left and insert three new columns in transposed 4x4 block.
60
+    3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
61
+};
62
+
63
+uint8x8_t inline filter8_8_pp(uint8x16_t samples, const int8x8_t filter,
64
+                              const uint8x16x3_t tbl)
65
+{
66
+    // Permute input samples for dot product.
67
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
68
+    uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val[0]);
69
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
70
+    uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val[1]);
71
+    // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
72
+    uint8x16_t perm_S2 = vqtbl1q_u8(samples, tbl.val[2]);
73
+
74
+    int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
75
+    dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1);
76
+    int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0);
77
+    dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_S2, filter, 1);
78
+
79
+    // Narrow and combine.
80
+    int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
81
+                                     vmovn_s32(dotprod_hi));
82
+    return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
83
+}
84
+
85
+void inline init_sample_permute(uint8x8_t *samples, const uint8x16x3_t tbl,
86
+                                uint8x16_t *d)
87
+{
88
+    // Permute input samples for dot product.
89
+    // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
90
+    d[0] = vqtbl1q_u8(vcombine_u8(samples[0], vdup_n_u8(0)), tbl.val[0]);
91
+    d[1] = vqtbl1q_u8(vcombine_u8(samples[1], vdup_n_u8(0)), tbl.val[0]);
92
+    d[2] = vqtbl1q_u8(vcombine_u8(samples[2], vdup_n_u8(0)), tbl.val[0]);
93
+    d[3] = vqtbl1q_u8(vcombine_u8(samples[3], vdup_n_u8(0)), tbl.val[0]);
94
+}
95
+
96
+uint8x8_t inline filter8_8_pp_reuse(uint8x16_t samples, const int8x8_t filter,
97
+                                    const uint8x16x3_t tbl, uint8x16_t &perm_s0)
98
+{
99
+    // Permute input samples for dot product.
100
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
101
+    // Already in perm_s0.
102
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
103
+    uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val[1]);
104
+    // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
105
+    uint8x16_t perm_s2 = vqtbl1q_u8(samples, tbl.val[2]);
106
+
107
+    int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
108
+    dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1);
109
+    int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0);
110
+    dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_s2, filter, 1);
111
+
112
+    // Save for re-use in next iteration.
113
+    perm_s0 = perm_s2;
114
+
115
+    // Narrow and combine.
116
+    int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
117
+                                     vmovn_s32(dotprod_hi));
118
+    return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
119
+}
120
+
121
+uint8x8_t inline filter8_8_pp_matmul(uint8x16_t samples, const int8x16_t filter,
122
+                                     const uint8x16x2_t tbl)
123
+{
124
+    // Permute input samples for 8x2 by 2x8 matrix multiply.
125
+    uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val[0]);
126
+    uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val[1]);
127
+
128
+    int32x4_t matmul_lo = vusmmlaq_s32(vdupq_n_s32(0), perm_s0, filter);
129
+    int32x4_t matmul_hi = vusmmlaq_s32(vdupq_n_s32(0), perm_s1, filter);
130
+
131
+    // Narrow and combine.
132
+    int16x8_t matmul = vcombine_s16(vmovn_s32(matmul_lo), vmovn_s32(matmul_hi));
133
+    return vqrshrun_n_s16(matmul, IF_FILTER_PREC);
134
+}
135
+
136
+int16x4_t inline filter8_4_ps(uint8x16_t samples, const int8x8_t filter,
137
+                              const int16x8_t constant, const uint8x16x3_t tbl)
138
+{
139
+    // Permute input samples for dot product.
140
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
141
+    uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val[0]);
142
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
143
+    uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val[1]);
144
+
145
+    int32x4_t dotprod = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
146
+    dotprod = vusdotq_lane_s32(dotprod, perm_s1, filter, 1);
147
+
148
+    // Narrow.
149
+    return vadd_s16(vmovn_s32(dotprod), vget_low_s16(constant));
150
+}
151
+
152
+int16x8_t inline filter8_8_ps(uint8x16_t samples, const int8x8_t filter,
153
+                              const int16x8_t constant, const uint8x16x3_t tbl)
154
+{
155
+    // Permute input samples for dot product.
156
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
157
+    uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val[0]);
158
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
159
+    uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val[1]);
160
+    // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
161
+    uint8x16_t perm_S2 = vqtbl1q_u8(samples, tbl.val[2]);
162
+
163
+    int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
164
+    dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1);
165
+    int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0);
166
+    dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_S2, filter, 1);
167
+
168
+    // Narrow and combine.
169
+    int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
170
+                                     vmovn_s32(dotprod_hi));
171
+    return vaddq_s16(dotprod, constant);
172
+}
173
+
174
+int16x8_t inline filter8_8_ps_reuse(uint8x16_t samples, const int8x8_t filter,
175
+                                    const int16x8_t constant,
176
+                                    const uint8x16x3_t tbl, uint8x16_t &perm_s0)
177
+{
178
+    // Permute input samples for dot product.
179
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
180
+    // Already in perm_s0.
181
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
182
+    uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val[1]);
183
+    // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
184
+    uint8x16_t perm_s2 = vqtbl1q_u8(samples, tbl.val[2]);
185
+
186
+    int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
187
+    dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1);
188
+    int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0);
189
+    dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_s2, filter, 1);
190
+
191
+    // Save for re-use in next iteration.
192
+    perm_s0 = perm_s2;
193
+
194
+    // Narrow and combine.
195
+    int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
196
+                                     vmovn_s32(dotprod_hi));
197
+    return vaddq_s16(dotprod, constant);
198
+}
199
+
200
+int16x8_t inline filter8_8_ps_matmul(uint8x16_t samples, const int8x16_t filter,
201
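A hedged illustration (not part of the patch): the *_matmul helpers above rely on the Armv8.6 I8MM USMMLA instruction (vusmmlaq_s32), which reads each 16-byte operand as a 2x8 matrix and accumulates a 2x2 block of dot products, so one instruction evaluates four 8-tap filter sums:

    #include <stdint.h>

    static inline void usmmla_ref(int32_t acc[2][2],
                                  const uint8_t a[2][8],   // two permuted sample rows
                                  const int8_t  b[2][8])   // two shifted copies of the filter
    {
        for (int i = 0; i < 2; i++)
            for (int j = 0; j < 2; j++)
                for (int k = 0; k < 8; k++)
                    acc[i][j] += a[i][k] * b[j][k];
    }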
x265_4.0.tar.gz/source/common/aarch64/filter-neon-i8mm.h Added
39
 
1
@@ -0,0 +1,37 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_FILTER_NEON_I8MM_H
26
+#define X265_FILTER_NEON_I8MM_H
27
+
28
+#if defined(HAVE_NEON_I8MM)
29
+
30
+#include "primitives.h"
31
+
32
+namespace X265_NS {
33
+void setupFilterPrimitives_neon_i8mm(EncoderPrimitives &p);
34
+}
35
+
36
+#endif // defined(HAVE_NEON_I8MM)
37
+
38
+#endif // X265_FILTER_NEON_I8MM_H
39
x265_3.6.tar.gz/source/common/aarch64/filter-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/filter-prim.cpp Changed
201
 
1
@@ -1,37 +1,2114 @@
2
 #if HAVE_NEON
3
 
4
 #include "filter-prim.h"
5
+#include "mem-neon.h"
6
+
7
 #include <arm_neon.h>
8
 
9
-namespace
10
+namespace {
11
+void inline filter4_s16x8(int coeffIdx, const int16x8_t *s, const int16x4_t f,
12
+                          const int32x4_t c, int32x4_t &d0, int32x4_t &d1)
13
+{
14
+    if (coeffIdx == 4)
15
+    {
16
+        // { -4, 36, 36, -4 }
17
+        int16x8_t t0 = vaddq_s16(s[1], s[2]);
18
+        int16x8_t t1 = vaddq_s16(s[0], s[3]);
19
+        d0 = vmlal_n_s16(c, vget_low_s16(t0), 36);
20
+        d0 = vmlsl_n_s16(d0, vget_low_s16(t1), 4);
21
+
22
+        d1 = vmlal_n_s16(c, vget_high_s16(t0), 36);
23
+        d1 = vmlsl_n_s16(d1, vget_high_s16(t1), 4);
24
+    }
25
+    else
26
+    {
27
+        d0 = vmlal_lane_s16(c, vget_low_s16(s[0]), f, 0);
28
+        d0 = vmlal_lane_s16(d0, vget_low_s16(s[1]), f, 1);
29
+        d0 = vmlal_lane_s16(d0, vget_low_s16(s[2]), f, 2);
30
+        d0 = vmlal_lane_s16(d0, vget_low_s16(s[3]), f, 3);
31
+
32
+        d1 = vmlal_lane_s16(c, vget_high_s16(s[0]), f, 0);
33
+        d1 = vmlal_lane_s16(d1, vget_high_s16(s[1]), f, 1);
34
+        d1 = vmlal_lane_s16(d1, vget_high_s16(s[2]), f, 2);
35
+        d1 = vmlal_lane_s16(d1, vget_high_s16(s[3]), f, 3);
36
+    }
37
+}
38
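// A hedged illustration (not part of the patch): the coeffIdx == 4 branch above exploits
// the symmetry of the half-pel chroma filter { -4, 36, 36, -4 }, pairing taps before
// multiplying so two multiplies replace four:
//
//     static inline int32_t chroma_halfpel_scalar(const int16_t s[4])
//     {
//         return 36 * (s[1] + s[2]) - 4 * (s[0] + s[3]);
//     }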
+
39
+template<int coeffIdx>
40
+void inline filter8_s16x4(const int16x4_t *s, const int32x4_t c, int32x4_t &d)
41
+{
42
+    if (coeffIdx == 1)
43
+    {
44
+        // { -1, 4, -10, 58, 17, -5, 1, 0 }
45
+        d = vsubl_s16(s[6], s[0]);
46
+        d = vaddq_s32(d, c);
47
+        d = vmlal_n_s16(d, s[1], 4);
48
+        d = vmlsl_n_s16(d, s[2], 10);
49
+        d = vmlal_n_s16(d, s[3], 58);
50
+        d = vmlal_n_s16(d, s[4], 17);
51
+        d = vmlsl_n_s16(d, s[5], 5);
52
+    }
53
+    else if (coeffIdx == 2)
54
+    {
55
+        // { -1, 4, -11, 40, 40, -11, 4, -1 }
56
+        int32x4_t t0 = vaddl_s16(s[3], s[4]);
57
+        int32x4_t t1 = vaddl_s16(s[2], s[5]);
58
+        int32x4_t t2 = vaddl_s16(s[1], s[6]);
59
+        int32x4_t t3 = vaddl_s16(s[0], s[7]);
60
+
61
+        d = vmlaq_n_s32(c, t0, 40);
62
+        d = vmlaq_n_s32(d, t1, -11);
63
+        d = vmlaq_n_s32(d, t2, 4);
64
+        d = vmlaq_n_s32(d, t3, -1);
65
+    }
66
+    else
67
+    {
68
+        // { 0, 1, -5, 17, 58, -10, 4, -1 }
69
+        d = vsubl_s16(s[1], s[7]);
70
+        d = vaddq_s32(d, c);
71
+        d = vmlal_n_s16(d, s[6], 4);
72
+        d = vmlsl_n_s16(d, s[5], 10);
73
+        d = vmlal_n_s16(d, s[4], 58);
74
+        d = vmlal_n_s16(d, s[3], 17);
75
+        d = vmlsl_n_s16(d, s[2], 5);
76
+    }
77
+}
78
+
79
+template<int coeffIdx>
80
+void inline filter8_s16x8(const int16x8_t *s, const int32x4_t c, int32x4_t &d0,
81
+                          int32x4_t &d1)
82
+{
83
+    if (coeffIdx == 1)
84
+    {
85
+        // { -1, 4, -10, 58, 17, -5, 1, 0 }
86
+        d0 = vsubl_s16(vget_low_s16(s6), vget_low_s16(s0));
87
+        d0 = vaddq_s32(d0, c);
88
+        d0 = vmlal_n_s16(d0, vget_low_s16(s1), 4);
89
+        d0 = vmlsl_n_s16(d0, vget_low_s16(s2), 10);
90
+        d0 = vmlal_n_s16(d0, vget_low_s16(s3), 58);
91
+        d0 = vmlal_n_s16(d0, vget_low_s16(s4), 17);
92
+        d0 = vmlsl_n_s16(d0, vget_low_s16(s5), 5);
93
+
94
+        d1 = vsubl_s16(vget_high_s16(s6), vget_high_s16(s0));
95
+        d1 = vaddq_s32(d1, c);
96
+        d1 = vmlal_n_s16(d1, vget_high_s16(s1), 4);
97
+        d1 = vmlsl_n_s16(d1, vget_high_s16(s2), 10);
98
+        d1 = vmlal_n_s16(d1, vget_high_s16(s3), 58);
99
+        d1 = vmlal_n_s16(d1, vget_high_s16(s4), 17);
100
+        d1 = vmlsl_n_s16(d1, vget_high_s16(s5), 5);
101
+    }
102
+    else if (coeffIdx == 2)
103
+    {
104
+        // { -1, 4, -11, 40, 40, -11, 4, -1 }
105
+        int32x4_t t0 = vaddl_s16(vget_low_s16(s[3]), vget_low_s16(s[4]));
106
+        int32x4_t t1 = vaddl_s16(vget_low_s16(s[2]), vget_low_s16(s[5]));
107
+        int32x4_t t2 = vaddl_s16(vget_low_s16(s[1]), vget_low_s16(s[6]));
108
+        int32x4_t t3 = vaddl_s16(vget_low_s16(s[0]), vget_low_s16(s[7]));
109
+
110
+        d0 = vmlaq_n_s32(c, t0, 40);
111
+        d0 = vmlaq_n_s32(d0, t1, -11);
112
+        d0 = vmlaq_n_s32(d0, t2, 4);
113
+        d0 = vmlaq_n_s32(d0, t3, -1);
114
+
115
+        int32x4_t t4 = vaddl_s16(vget_high_s16(s[3]), vget_high_s16(s[4]));
116
+        int32x4_t t5 = vaddl_s16(vget_high_s16(s[2]), vget_high_s16(s[5]));
117
+        int32x4_t t6 = vaddl_s16(vget_high_s16(s[1]), vget_high_s16(s[6]));
118
+        int32x4_t t7 = vaddl_s16(vget_high_s16(s[0]), vget_high_s16(s[7]));
119
+
120
+        d1 = vmlaq_n_s32(c, t4, 40);
121
+        d1 = vmlaq_n_s32(d1, t5, -11);
122
+        d1 = vmlaq_n_s32(d1, t6, 4);
123
+        d1 = vmlaq_n_s32(d1, t7, -1);
124
+    }
125
+    else
126
+    {
127
+        // { 0, 1, -5, 17, 58, -10, 4, -1 }
128
+        d0 = vsubl_s16(vget_low_s16(s1), vget_low_s16(s7));
129
+        d0 = vaddq_s32(d0, c);
130
+        d0 = vmlal_n_s16(d0, vget_low_s16(s6), 4);
131
+        d0 = vmlsl_n_s16(d0, vget_low_s16(s5), 10);
132
+        d0 = vmlal_n_s16(d0, vget_low_s16(s4), 58);
133
+        d0 = vmlal_n_s16(d0, vget_low_s16(s3), 17);
134
+        d0 = vmlsl_n_s16(d0, vget_low_s16(s2), 5);
135
+
136
+        d1 = vsubl_s16(vget_high_s16(s1), vget_high_s16(s7));
137
+        d1 = vaddq_s32(d1, c);
138
+        d1 = vmlal_n_s16(d1, vget_high_s16(s6), 4);
139
+        d1 = vmlsl_n_s16(d1, vget_high_s16(s5), 10);
140
+        d1 = vmlal_n_s16(d1, vget_high_s16(s4), 58);
141
+        d1 = vmlal_n_s16(d1, vget_high_s16(s3), 17);
142
+        d1 = vmlsl_n_s16(d1, vget_high_s16(s2), 5);
143
+    }
144
+}
145
+
146
+template<int width, int height>
147
+void interp4_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst,
148
+                          intptr_t dstStride, int coeffIdx)
149
+{
150
+    const int N_TAPS = 4;
151
+    src -= (N_TAPS / 2 - 1) * srcStride;
152
+
153
+    const int16x4_t filter = vld1_s16(X265_NS::g_chromaFilter[coeffIdx]);
154
+
155
+    // Zero constant in order to use filter helper functions (optimised away).
156
+    const int32x4_t c = vdupq_n_s32(0);
157
+
158
+    if (width == 12)
159
+    {
160
+        const int16_t *s = src;
161
+        int16_t *d = dst;
162
+
163
+        int16x8_t in[7];
164
+        load_s16x8xn<3>(s, srcStride, in);
165
+        s += 3 * srcStride;
166
+
167
+        for (int row = 0; (row + 4) <= height; row += 4)
168
+        {
169
+            load_s16x8xn<4>(s, srcStride, in + 3);
170
+
171
+            int32x4_t sum_lo[4];
172
+            int32x4_t sum_hi[4];
173
+            filter4_s16x8(coeffIdx, in + 0, filter, c, sum_lo[0], sum_hi[0]);
174
+            filter4_s16x8(coeffIdx, in + 1, filter, c, sum_lo[1], sum_hi[1]);
175
+            filter4_s16x8(coeffIdx, in + 2, filter, c, sum_lo[2], sum_hi[2]);
176
+            filter4_s16x8(coeffIdx, in + 3, filter, c, sum_lo[3], sum_hi[3]);
177
+
178
+            int16x8_t sum[4];
179
+            sum[0] = vcombine_s16(vshrn_n_s32(sum_lo[0], IF_FILTER_PREC),
180
+                                  vshrn_n_s32(sum_hi[0], IF_FILTER_PREC));
181
+            sum[1] = vcombine_s16(vshrn_n_s32(sum_lo[1], IF_FILTER_PREC),
182
+                                  vshrn_n_s32(sum_hi[1], IF_FILTER_PREC));
183
+            sum[2] = vcombine_s16(vshrn_n_s32(sum_lo[2], IF_FILTER_PREC),
184
+                                  vshrn_n_s32(sum_hi[2], IF_FILTER_PREC));
185
+            sum[3] = vcombine_s16(vshrn_n_s32(sum_lo[3], IF_FILTER_PREC),
186
+                                  vshrn_n_s32(sum_hi[3], IF_FILTER_PREC));
187
+
188
+            store_s16x8xn<4>(d, dstStride, sum);
189
+
190
+            in[0] = in[4];
191
+            in[1] = in[5];
192
+            in[2] = in[6];
193
+
194
+            s += 4 * srcStride;
195
+            d += 4 * dstStride;
196
+        }
197
+
198
+        src += 8;
199
+        dst += 8;
200
+        s = src;
201
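A hedged illustration (not part of the patch): a scalar reference for what the interp4_vert_ss_neon template above computes, assuming the usual x265 conventions (g_chromaFilter coefficients, IF_FILTER_PREC == 6, plain shift without rounding in the ss path):

    #include <stdint.h>

    static void interp4_vert_ss_ref(const int16_t *src, intptr_t srcStride,
                                    int16_t *dst, intptr_t dstStride,
                                    const int16_t f[4], int width, int height)
    {
        src -= srcStride;                                  // first tap is one row above
        for (int y = 0; y < height; y++)
            for (int x = 0; x < width; x++)
            {
                int32_t acc = 0;
                for (int k = 0; k < 4; k++)
                    acc += f[k] * src[(y + k) * srcStride + x];
                dst[y * dstStride + x] = (int16_t)(acc >> 6);
            }
    }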
x265_3.6.tar.gz/source/common/aarch64/fun-decls.h -> x265_4.0.tar.gz/source/common/aarch64/fun-decls.h Changed
201
 
1
@@ -69,6 +69,24 @@
2
     ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \
3
     ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__)
4
 
5
+#define FUNCDEF_PU_MULT_16(ret, name, cpu, ...) \
6
+    ret PFX(name ## _16x16_ ## cpu)(__VA_ARGS__); \
7
+    ret PFX(name ## _32x32_ ## cpu)(__VA_ARGS__); \
8
+    ret PFX(name ## _64x64_ ## cpu)(__VA_ARGS__); \
9
+    ret PFX(name ## _16x8_  ## cpu)(__VA_ARGS__); \
10
+    ret PFX(name ## _16x32_ ## cpu)(__VA_ARGS__); \
11
+    ret PFX(name ## _32x16_ ## cpu)(__VA_ARGS__); \
12
+    ret PFX(name ## _64x32_ ## cpu)(__VA_ARGS__); \
13
+    ret PFX(name ## _32x64_ ## cpu)(__VA_ARGS__); \
14
+    ret PFX(name ## _16x12_ ## cpu)(__VA_ARGS__); \
15
+    ret PFX(name ## _16x4_  ## cpu)(__VA_ARGS__); \
16
+    ret PFX(name ## _32x24_ ## cpu)(__VA_ARGS__); \
17
+    ret PFX(name ## _32x8_  ## cpu)(__VA_ARGS__); \
18
+    ret PFX(name ## _64x48_ ## cpu)(__VA_ARGS__); \
19
+    ret PFX(name ## _48x64_ ## cpu)(__VA_ARGS__); \
20
+    ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \
21
+    ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__)
22
+
23
 #define FUNCDEF_CHROMA_PU(ret, name, cpu, ...) \
24
     FUNCDEF_PU(ret, name, cpu, __VA_ARGS__); \
25
     ret PFX(name ## _4x2_ ## cpu)(__VA_ARGS__); \
26
@@ -113,23 +131,8 @@
27
     FUNCDEF_CHROMA_PU(void, blockcopy_pp, cpu, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
28
     FUNCDEF_PU(void, blockcopy_sp, cpu, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); \
29
     FUNCDEF_PU(void, blockcopy_ps, cpu, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
30
-    FUNCDEF_PU(void, interp_8tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
31
-    FUNCDEF_PU(void, interp_8tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
32
-    FUNCDEF_PU(void, interp_8tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
33
-    FUNCDEF_PU(void, interp_8tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
34
-    FUNCDEF_PU(void, interp_8tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
35
-    FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
36
-    FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \
37
     FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
38
     FUNCDEF_CHROMA_PU(void, filterPixelToShort_aligned, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
39
-    FUNCDEF_CHROMA_PU(void, interp_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
40
-    FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
41
-    FUNCDEF_CHROMA_PU(void, interp_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
42
-    FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
43
-    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
44
-    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
45
-    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
46
-    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
47
     FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
48
     FUNCDEF_CHROMA_PU(void, addAvg_aligned, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
49
     FUNCDEF_PU(void, pixel_avg_pp, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
50
@@ -154,70 +157,74 @@
51
 DECLS(sve);
52
 DECLS(sve2);
53
 
54
+FUNCDEF_PU_MULT_16(int, pixel_sad, neon_dotprod, const pixel*, intptr_t, const pixel*, intptr_t);
55
+FUNCDEF_PU_MULT_16(void, sad_x3, neon_dotprod, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*);
56
+FUNCDEF_PU_MULT_16(void, sad_x4, neon_dotprod, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*);
57
+FUNCDEF_PU(sse_t, pixel_sse_pp, neon_dotprod, const pixel*, intptr_t, const pixel*, intptr_t);
58
 
59
-void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
60
+void PFX(pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift));
61
 
62
-uint64_t x265_pixel_var_8x8_neon(const pixel* pix, intptr_t stride);
63
-uint64_t x265_pixel_var_16x16_neon(const pixel* pix, intptr_t stride);
64
-uint64_t x265_pixel_var_32x32_neon(const pixel* pix, intptr_t stride);
65
-uint64_t x265_pixel_var_64x64_neon(const pixel* pix, intptr_t stride);
66
+uint64_t PFX(pixel_var_8x8_neon(const pixel* pix, intptr_t stride));
67
+uint64_t PFX(pixel_var_16x16_neon(const pixel* pix, intptr_t stride));
68
+uint64_t PFX(pixel_var_32x32_neon(const pixel* pix, intptr_t stride));
69
+uint64_t PFX(pixel_var_64x64_neon(const pixel* pix, intptr_t stride));
70
 
71
-void x265_getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
72
-void x265_getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
73
-void x265_getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
74
-void x265_getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
75
+void PFX(getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
76
+void PFX(getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
77
+void PFX(getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
78
+void PFX(getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
79
 
80
-void x265_scale1D_128to64_neon(pixel *dst, const pixel *src);
81
-void x265_scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride);
82
+void PFX(scale1D_128to64_neon(pixel *dst, const pixel *src));
83
+void PFX(scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride));
84
 
85
-int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
86
-int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
87
-int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
88
-int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
89
-int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
90
-int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
91
-int x265_pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
92
-int x265_pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
93
-int x265_pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
94
-int x265_pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
95
-int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
96
-int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
97
-int x265_pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
98
-int x265_pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
99
-int x265_pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
100
-int x265_pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
101
-int x265_pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
102
-int x265_pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
103
-int x265_pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
104
-int x265_pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
105
-int x265_pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
106
-int x265_pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
107
-int x265_pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
108
-int x265_pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
109
-int x265_pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
110
-int x265_pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
111
-int x265_pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
112
-int x265_pixel_satd_48x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
113
-int x265_pixel_satd_64x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
114
-int x265_pixel_satd_64x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
115
-int x265_pixel_satd_64x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
116
-int x265_pixel_satd_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
117
+int PFX(pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
118
+int PFX(pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
119
+int PFX(pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
120
+int PFX(pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
121
+int PFX(pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
122
+int PFX(pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
123
+int PFX(pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
124
+int PFX(pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
125
+int PFX(pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
126
+int PFX(pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
127
+int PFX(pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
128
+int PFX(pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
129
+int PFX(pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
130
+int PFX(pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
131
+int PFX(pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
132
+int PFX(pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
133
+int PFX(pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
134
+int PFX(pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
135
+int PFX(pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
136
+int PFX(pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
137
+int PFX(pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
138
+int PFX(pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
139
+int PFX(pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
140
+int PFX(pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
141
+int PFX(pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
142
+int PFX(pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
143
+int PFX(pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
144
+int PFX(pixel_satd_48x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
145
+int PFX(pixel_satd_64x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
146
+int PFX(pixel_satd_64x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
147
+int PFX(pixel_satd_64x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
148
+int PFX(pixel_satd_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
149
 
150
-int x265_pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
151
-int x265_pixel_sa8d_8x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
152
-int x265_pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
153
-int x265_pixel_sa8d_16x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
154
-int x265_pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
155
-int x265_pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
156
-int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
157
+int PFX(pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
158
+int PFX(pixel_sa8d_8x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
159
+int PFX(pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
160
+int PFX(pixel_sa8d_16x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
161
+int PFX(pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
162
+int PFX(pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
163
+int PFX(pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
164
 
165
 uint32_t PFX(quant_neon)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
166
 uint32_t PFX(nquant_neon)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
167
 
168
-void x265_dequant_scaling_neon(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift);
169
-void x265_dequant_normal_neon(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
170
+void PFX(dequant_scaling_neon(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift));
171
+void PFX(dequant_normal_neon(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift));
172
 
173
-void x265_ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]);
174
+void PFX(ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]));
175
 
176
 int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
177
 int PFX(psyCost_8x8_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
178
@@ -226,30 +233,28 @@
179
 int PFX(scanPosLast_neon)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
180
 uint32_t PFX(costCoeffNxN_neon)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
181
 
182
-uint64_t x265_pixel_var_8x8_sve2(const pixel* pix, intptr_t stride);
183
-uint64_t x265_pixel_var_16x16_sve2(const pixel* pix, intptr_t stride);
184
-uint64_t x265_pixel_var_32x32_sve2(const pixel* pix, intptr_t stride);
185
-uint64_t x265_pixel_var_64x64_sve2(const pixel* pix, intptr_t stride);
186
-
187
-void x265_getResidual16_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
188
-void x265_getResidual32_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
189
+uint64_t PFX(pixel_var_8x8_sve2(const pixel* pix, intptr_t stride));
190
+uint64_t PFX(pixel_var_16x16_sve2(const pixel* pix, intptr_t stride));
191
+uint64_t PFX(pixel_var_32x32_sve2(const pixel* pix, intptr_t stride));
192
+uint64_t PFX(pixel_var_64x64_sve2(const pixel* pix, intptr_t stride));
193
 
194
-void x265_scale1D_128to64_sve2(pixel *dst, const pixel *src);
195
-void x265_scale2D_64to32_sve2(pixel* dst, const pixel* src, intptr_t stride);
196
+void PFX(getResidual16_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
197
+void PFX(getResidual32_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
198
 
199
-int x265_pixel_satd_4x4_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
200
-int x265_pixel_satd_8x4_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
201
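A hedged illustration (not part of the patch): the new FUNCDEF_PU_MULT_16 macro above declares one prototype per partition that is at least 16 pixels wide, so for example

    FUNCDEF_PU_MULT_16(int, pixel_sad, neon_dotprod, const pixel*, intptr_t, const pixel*, intptr_t);

expands to declarations such as

    int PFX(pixel_sad_16x16_neon_dotprod)(const pixel*, intptr_t, const pixel*, intptr_t);
    int PFX(pixel_sad_64x64_neon_dotprod)(const pixel*, intptr_t, const pixel*, intptr_t);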
x265_3.6.tar.gz/source/common/aarch64/intrapred-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/intrapred-prim.cpp Changed
201
 
1
@@ -2,7 +2,7 @@
2
 #include "primitives.h"
3
 
4
 
5
-#if 1
6
+#if HAVE_NEON
7
 #include "arm64-utils.h"
8
 #include <arm_neon.h>
9
 
10
@@ -12,6 +12,52 @@
11
 {
12
 
13
 
14
+template<int tuSize>
15
+void intraFilter_neon(const pixel* samples, pixel* filtered) /* 1:2:1 filtering of left and top reference samples */
16
+{
17
+    const int tuSize2 = tuSize << 1;
18
+    pixel topLeft = samples[0], topLast = samples[tuSize2], leftLast = samples[tuSize2 + tuSize2];
19
+
20
+    uint16x8_t two_vec = vdupq_n_u16(2);
21
+#if !HIGH_BIT_DEPTH
22
+    {
23
+        for(int i = 0; i < tuSize2 + tuSize2; i+=8)
24
+         {
25
+            uint16x8_t sample1 = vmovl_u8(vld1_u8(&samples[i]));
26
+            uint16x8_t sample2 = vmovl_u8(vld1_u8(&samples[i-1]));
27
+            uint16x8_t sample3 = vmovl_u8(vld1_u8(&samples[i+1]));
28
+
29
+            uint16x8_t result1 = vaddq_u16(vshlq_n_u16(sample1,1), sample2 );
30
+            uint16x8_t result2 = vaddq_u16(sample3, two_vec);
31
+            uint16x8_t result3 = vaddq_u16(result1,result2);
32
+            vst1_u8(&filtered[i] , vmovn_u16(vshrq_n_u16(result3, 2)));
33
+        }
34
+    }
35
+#else
36
+    {
37
+        for(int i = 0; i < tuSize2 + tuSize2; i+=8)
38
+        {
39
+            uint16x8_t sample1 = vld1q_u16(&samples[i]);
40
+            uint16x8_t sample2 = vld1q_u16(&samples[i-1]);
41
+            uint16x8_t sample3 = vld1q_u16(&samples[i+1]);
42
+
43
+            uint16x8_t result1 = vaddq_u16(vshlq_n_u16(sample1,1), sample2 );
44
+            uint16x8_t result2 = vaddq_u16(sample3, two_vec);
45
+            uint16x8_t result3 = vaddq_u16(result1,result2);
46
+            vst1q_u16(&filtered[i] , vshrq_n_u16(result3, 2));
47
+        }
48
+    }
49
+#endif
50
+    // filtering top
51
+    filtered[tuSize2] = topLast;
52
+
53
+    // filtering top-left
54
+    filtered[0] = ((topLeft << 1) + samples[1] + samples[tuSize2 + 1] + 2) >> 2;
55
+
56
+    // filtering left
57
+    filtered[tuSize2 + 1] = ((samples[tuSize2 + 1] << 1) + topLeft + samples[tuSize2 + 2] + 2) >> 2;
58
+    filtered[tuSize2 + tuSize2] = leftLast;
59
+}
60
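// A hedged illustration (not part of the patch): the scalar form of the 1:2:1 reference
// sample smoothing vectorised by intraFilter_neon above (the top-left corner, the first
// left sample and the unfiltered last samples are patched separately after the loop):
//
//     filtered[i] = (samples[i - 1] + 2 * samples[i] + samples[i + 1] + 2) >> 2;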
 
61
 template<int width>
62
 void intra_pred_ang_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter)
63
@@ -105,30 +151,42 @@
64
             {
65
                 if (width >= 8 && sizeof(pixel) == 1)
66
                 {
67
-                    const int16x8_t f0 = vdupq_n_s16(32 - fraction);
68
-                    const int16x8_t f1 = vdupq_n_s16(fraction);
69
+                    // We have to cast to the 'real' type so that this block
70
+                    // will compile for both low and high bitdepth.
71
+                    const uint8_t *ref_u8 = (const uint8_t *)ref + offset;
72
+                    uint8_t *dst_u8 = (uint8_t *)dst;
73
+
74
+                    // f0 and f1 are unsigned (fraction is in range 0, 31).
75
+                    const uint8x8_t f0 = vdup_n_u8(32 - fraction);
76
+                    const uint8x8_t f1 = vdup_n_u8(fraction);
77
                     for (int x = 0; x < width; x += 8)
78
                     {
79
-                        uint8x8_t in0 = *(uint8x8_t *)&ref[offset + x];
80
-                        uint8x8_t in1 = *(uint8x8_t *)&ref[offset + x + 1];
81
-                        int16x8_t lo = vmlaq_s16(vdupq_n_s16(16), vmovl_u8(in0), f0);
82
-                        lo = vmlaq_s16(lo, vmovl_u8(in1), f1);
83
-                        lo = vshrq_n_s16(lo, 5);
84
-                        *(uint8x8_t *)&dst[y * dstStride + x] = vmovn_u16(lo);
85
+                        uint8x8_t in0 = vld1_u8(ref_u8 + x);
86
+                        uint8x8_t in1 = vld1_u8(ref_u8 + x + 1);
87
+                        uint16x8_t lo = vmlal_u8(vdupq_n_u16(16), in0, f0);
88
+                        lo = vmlal_u8(lo, in1, f1);
89
+                        uint8x8_t res = vshrn_n_u16(lo, 5);
90
+                        vst1_u8(dst_u8 + y * dstStride + x, res);
91
                     }
92
                 }
93
                 else if (width >= 4 && sizeof(pixel) == 2)
94
                 {
95
-                    const int32x4_t f0 = vdupq_n_s32(32 - fraction);
96
-                    const int32x4_t f1 = vdupq_n_s32(fraction);
97
+                    // We have to cast to the 'real' type so that this block
98
+                    // will compile for both low and high bitdepth.
99
+                    const uint16_t *ref_u16 = (const uint16_t *)ref + offset;
100
+                    uint16_t *dst_u16 = (uint16_t *)dst;
101
+
102
+                    // f0 and f1 are unsigned (fraction is in range 0, 31).
103
+                    const uint16x4_t f0 = vdup_n_u16(32 - fraction);
104
+                    const uint16x4_t f1 = vdup_n_u16(fraction);
105
                     for (int x = 0; x < width; x += 4)
106
                     {
107
-                        uint16x4_t in0 = *(uint16x4_t *)&ref[offset + x];
108
-                        uint16x4_t in1 = *(uint16x4_t *)&ref[offset + x + 1];
109
-                        int32x4_t lo = vmlaq_s32(vdupq_n_s32(16), vmovl_u16(in0), f0);
110
-                        lo = vmlaq_s32(lo, vmovl_u16(in1), f1);
111
-                        lo = vshrq_n_s32(lo, 5);
112
-                        *(uint16x4_t *)&dst[y * dstStride + x] = vmovn_u32(lo);
113
+                        uint16x4_t in0 = vld1_u16(ref_u16 + x);
114
+                        uint16x4_t in1 = vld1_u16(ref_u16 + x + 1);
115
+                        uint32x4_t lo = vmlal_u16(vdupq_n_u32(16), in0, f0);
116
+                        lo = vmlal_u16(lo, in1, f1);
117
+                        uint16x4_t res = vshrn_n_u32(lo, 5);
118
+                        vst1_u16(dst_u16 + y * dstStride + x, res);
119
                     }
120
                 }
121
                 else
122
@@ -176,6 +234,7 @@
123
     }
124
 }
125
 
126
+#endif
127
 template<int log2Size>
128
 void all_angs_pred_neon(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
129
 {
130
@@ -220,14 +279,285 @@
131
         }
132
     }
133
 }
134
+
135
+template<int log2Size>
136
+void planar_pred_neon(pixel * dst, intptr_t dstStride, const pixel * srcPix, int /*dirMode*/, int /*bFilter*/)
137
+{
138
+    const int blkSize = 1 << log2Size;
139
+
140
+    const pixel* above = srcPix + 1;
141
+    const pixel* left = srcPix + (2 * blkSize + 1);
142
+
143
+    switch (blkSize) {
144
+    case 8:
145
+    {
146
+        const uint16_t log2SizePlusOne = log2Size + 1;
147
+        uint16x8_t blkSizeVec = vdupq_n_u16(blkSize);
148
+        uint16x8_t topRight = vdupq_n_u16(above[blkSize]);
149
+        uint16_t bottomLeft = left[blkSize];
150
+        uint16x8_t oneVec = vdupq_n_u16(1);
151
+        uint16x8_t blkSizeSubOneVec = vdupq_n_u16(blkSize - 1);
152
+
153
+        for (int y = 0; y < blkSize; y++) {
154
+            // (blkSize - 1 - y)
155
+            uint16x8_t vlkSizeYVec = vdupq_n_u16(blkSize - 1 - y);
156
+            // (y + 1) * bottomLeft
157
+            uint16x8_t bottomLeftYVec = vdupq_n_u16((y + 1) * bottomLeft);
158
+            // left[y]
159
+            uint16x8_t leftYVec = vdupq_n_u16(left[y]);
160
+
161
+            for (int x = 0; x < blkSize; x += 8) {
162
+                int idx = y * dstStride + x;
163
+                uint16x8_t xvec = { (uint16_t)(x + 0), (uint16_t)(x + 1),
164
+                                    (uint16_t)(x + 2), (uint16_t)(x + 3),
165
+                                    (uint16_t)(x + 4), (uint16_t)(x + 5),
166
+                                    (uint16_t)(x + 6), (uint16_t)(x + 7) };
167
+
168
+                // (blkSize - 1 - y) * above[x]
169
+                uint16x8_t aboveVec = { (uint16_t)(above[x + 0]),
170
+                                        (uint16_t)(above[x + 1]),
171
+                                        (uint16_t)(above[x + 2]),
172
+                                        (uint16_t)(above[x + 3]),
173
+                                        (uint16_t)(above[x + 4]),
174
+                                        (uint16_t)(above[x + 5]),
175
+                                        (uint16_t)(above[x + 6]),
176
+                                        (uint16_t)(above[x + 7]) };
177
+
178
+                aboveVec = vmulq_u16(aboveVec, vlkSizeYVec);
179
+
180
+                // (blkSize - 1 - x) * left[y]
181
+                uint16x8_t first = vsubq_u16(blkSizeSubOneVec, xvec);
182
+                first = vmulq_u16(first, leftYVec);
183
+
184
+                // (x + 1) * topRight
185
+                uint16x8_t second = vaddq_u16(xvec, oneVec);
186
+                second = vmulq_u16(second, topRight);
187
+
188
+                uint16x8_t resVec = vaddq_u16(first, second);
189
+                resVec = vaddq_u16(resVec, aboveVec);
190
+                resVec = vaddq_u16(resVec, bottomLeftYVec);
191
+                resVec = vaddq_u16(resVec, blkSizeVec);
192
+                resVec = vshrq_n_u16(resVec, log2SizePlusOne);
193
+
194
+                for (int i = 0; i < 8; i++)
195
+                    dst[idx + i] = (pixel)resVec[i];
196
+            }
197
+        }
198
+    }
199
+    break;
200
+    case 4:
201
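For reference, the arithmetic performed per sample by the vmlal_u16/vshrn_n_u32 loop above (and by the scalar code it replaces) is plain fixed-point linear interpolation between two neighbouring reference samples. A minimal scalar sketch follows; the helper name is made up for illustration and is not part of x265:

    #include <stdint.h>

    // Blend ref[x] and ref[x + 1] with weights (32 - fraction) and fraction,
    // adding 16 before the >> 5 so the result is rounded to nearest.
    static inline uint16_t interp_sample(const uint16_t *ref, int x, int fraction)
    {
        uint32_t acc = 16;
        acc += (uint32_t)(32 - fraction) * ref[x];
        acc += (uint32_t)fraction * ref[x + 1];
        return (uint16_t)(acc >> 5);
    }

Keeping the whole accumulation unsigned is what lets the same block compile for both low and high bit depth, as the new comment in the patch notes.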
x265_4.0.tar.gz/source/common/aarch64/intrapred.S Added
173
 
1
@@ -0,0 +1,171 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Min Chen <min.chen@multicorewareinc.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// Functions in this file:
26
+// ***** luma_vpp *****
27
+
28
+#include "asm.S"
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+.align 4
41
+tbl_const_1to8_7to0:
42
+    .byte 1, 2, 3, 4, 5, 6, 7, 8
43
+    .byte 7, 6, 5, 4, 3, 2, 1, 0
44
+    .byte 9, 10, 11, 12, 13, 14, 15, 16
45
+    .byte 15, 14, 13, 12, 11, 10, 9, 8
46
+
47
+// ***** planar_pred *****
48
+// void planar_pred(pixel* dst, intptr_t dstStride, const pixel* srcPix, int /*dirMode*/, int /*bFilter*/)
49
+function PFX(intra_pred_planar8_neon)
50
+// Register map
51
+// x0  = dst
52
+// x1  = dstStride
53
+// x2  = *srcPix
54
+// x3  = left[x]
55
+// x4  = tmp
56
+// v0  = above[7:0]
57
+// v1  = left[7:0]
58
+// v2  = topRight = rep(above[blkSize])
59
+// v3  = bottomLeft = rep(left[blkSize])
60
+// v4  = const[8 7 6 5 4 3 2 1]
61
+// v5  = const[7 6 5 4 3 2 1 0]
62
+
63
+//{
64
+//    const int blkSize = 1 << log2Size;
65
+//    const pixel* above = srcPix + 1;
66
+//    const pixel* left  = srcPix + (2 * blkSize + 1);
67
+//    pixel topRight = above[blkSize];
68
+//    pixel bottomLeft = left[blkSize];
69
+//    for (int y = 0; y < blkSize; y++)
70
+//        for (int x = 0; x < blkSize; x++)
71
+//            dst[y * dstStride + x] = (pixel) (((blkSize - 1 - x) * left[y] + (blkSize - 1 -y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft + blkSize) >> (log2Size + 1));
72
+//}
73
+
74
+    ldurb           w3, [x2, #(1+8)]                // topRight
75
+    ldurb           w4, [x2, #(2*8+1+8)]            // bottomLeft
76
+    dup             v2.8b, w3                       // v2 = topRight_b
77
+    dup             v3.8h, w4                       // v3 = bottomLeft_h
78
+    ldr             x3, [x2, #(2*8+1)]              // x3 = left[x]_b
79
+    ldr             d0, [x2, #1]                    // v0 = above[x]_b
80
+
81
+    adr             x4, tbl_const_1to8_7to0
82
+    ldr             d4, [x4]                        // v4 = const_b[8 7 6 5 4 3 2 1]
83
+    ldr             d5, [x4, #8]                    // v5 = const_b[7 6 5 4 3 2 1 0]
84
+
85
+    ushll           v6.8h, v0.8b, #3                // v6 = 8 * above[x]
86
+    usubw           v0.8h, v3.8h, v0.8b             // v0 = bottomLeft - above[x]
87
+
88
+    umlal           v6.8h, v4.8b, v2.8b             // v6 = 8 * above[x] + (x + 1) * topRight
89
+
90
+    mov             w4, #8
91
+
92
+1:
93
+    dup             v1.8b, w3
94
+    lsr             x3, x3, #8
95
+    add             v6.8h, v6.8h, v0.8h             // v6 = (blkSize - 1 -y=0) * above[x] + (x + 1) * topRight + (y=0 + 1) * bottomLeft
96
+    mov             v3.16b, v6.16b
97
+    umlal           v3.8h, v5.8b, v1.8b             // v3 = (blkSize - 1 - x) * left[y=0] + (blkSize - 1 -y=0) * above[x] + (x + 1) * topRight + (y=0 + 1) * bottomLeft
98
+    rshrn           v3.8b, v3.8h, #4
99
+    sub             w4, w4, #1
100
+    st1             {v3.8b}, [x0], x1
101
+    cbnz            w4, 1b
102
+
103
+    ret
104
+endfunc
105
+
106
+// void planar_pred(pixel* dst, intptr_t dstStride, const pixel* srcPix, int /*dirMode*/, int /*bFilter*/)
107
+function PFX(intra_pred_planar16_neon)
108
+// Register map
109
+// x0  = dst
110
+// x1  = dstStride
111
+// x2  = *srcPix
112
+// x3  = left[x]
113
+// x4  = tmp
114
+// v0  = above[7:0]
115
+// v1  = left[7:0]
116
+// v2  = topRight = rep(above[blkSize])
117
+// v3  = bottomLeft = rep(left[blkSize])
118
+// v4  = const[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
119
+// v5  = const[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
120
+
121
+//{
122
+//    const int blkSize = 1 << log2Size;
123
+//    const pixel* above = srcPix + 1;
124
+//    const pixel* left  = srcPix + (2 * blkSize + 1);
125
+//    pixel topRight = above[blkSize];
126
+//    pixel bottomLeft = left[blkSize];
127
+//    for (int y = 0; y < blkSize; y++)
128
+//        for (int x = 0; x < blkSize; x++)
129
+//            dst[y * dstStride + x] = (pixel) (((blkSize - 1 - x) * left[y] + (blkSize - 1 -y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft + blkSize) >> (log2Size + 1));
130
+//}
131
+
132
+    ldurb           w3, [x2, #(1+16)]               // topRight
133
+    ldurb           w4, [x2, #(2*16+1+16)]          // bottomLeft
134
+    ldr             q0, [x2, #(2*16+1)]             // v0 = left[x]_b
135
+    ldr             q1, [x2, #1]                    // v1 = above[x]_b
136
+    dup             v2.16b, w3                      // v2 = topRight_b
137
+    dup             v3.8h, w4                       // v3 = bottomLeft_h
138
+
139
+    adr             x4, tbl_const_1to8_7to0
140
+    ld2             {v4.2d, v5.2d}, [x4]            // v4 = const_b[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
141
+    ext             v5.16b, v5.16b, v5.16b, #8      // v5 = const_b[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
142
+
143
+    ushll           v16.8h, v1.8b, #4               // v16,v17 = 16 * above[x]
144
+    ushll2          v17.8h, v1.16b, #4
145
+    usubw           v6.8h, v3.8h, v1.8b             // v6,v7 = bottomLeft - above[x]
146
+    usubw2          v7.8h, v3.8h, v1.16b
147
+
148
+    umlal           v16.8h, v4.8b, v2.8b            // v16,v17 = 16 * above[x] + (x + 1) * topRight
149
+    umlal2          v17.8h, v4.16b, v2.16b
150
+
151
+    mov             w4, #16
152
+
153
+1:
154
+    dup             v1.16b, v0.b[0]                 // v1 = left[x]_b
155
+    ext             v0.16b, v0.16b, v0.16b, #1
156
+
157
+    add             v16.8h, v16.8h, v6.8h           // v16,v17 = (blkSize - 1 -y=0) * above[x] + (x + 1) * topRight + (y=0 + 1) * bottomLeft
158
+    add             v17.8h, v17.8h, v7.8h
159
+
160
+    mov             v18.16b, v16.16b
161
+    mov             v19.16b, v17.16b
162
+
163
+    umlal           v18.8h, v5.8b, v1.8b             // v3 = (blkSize - 1 - x) * left[y=0] + (blkSize - 1 -y=0) * above[x] + (x + 1) * topRight + (y=0 + 1) * bottomLeft
164
+    umlal2          v19.8h, v5.16b, v1.16b
165
+    rshrn           v18.8b, v18.8h, #5
166
+    rshrn2          v18.16b, v19.8h, #5
167
+    st1             {v18.16b}, [x0], x1
168
+    sub             w4, w4, #1
169
+    cbnz            w4, 1b
170
+
171
+    ret
172
+endfunc
173
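The commented-out reference inside both functions is the standard HEVC planar formula. As a sanity check, a standalone scalar version is sketched below (8-bit pixels assumed, names chosen for illustration, not part of x265); note that the asm folds the "+ blkSize" rounding term and the ">> (log2Size + 1)" shift into the single rshrn #4 / rshrn #5 rounding narrow.

    #include <stdint.h>
    #include <stddef.h>

    // Scalar HEVC planar prediction for an 8-bit blkSize x blkSize block.
    // above/left point at the neighbouring reference samples, as in the asm.
    static void planar_pred_ref(uint8_t *dst, ptrdiff_t dstStride,
                                const uint8_t *above, const uint8_t *left,
                                int log2Size)
    {
        const int blkSize = 1 << log2Size;
        const int topRight = above[blkSize];
        const int bottomLeft = left[blkSize];

        for (int y = 0; y < blkSize; y++)
            for (int x = 0; x < blkSize; x++)
                dst[y * dstStride + x] = (uint8_t)(((blkSize - 1 - x) * left[y] +
                                                    (blkSize - 1 - y) * above[x] +
                                                    (x + 1) * topRight +
                                                    (y + 1) * bottomLeft +
                                                    blkSize) >> (log2Size + 1));
    }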
x265_3.6.tar.gz/source/common/aarch64/loopfilter-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/loopfilter-prim.cpp Changed
201
 
1
@@ -1,3 +1,4 @@
2
+#include "common.h"
3
 #include "loopfilter-prim.h"
4
 
5
 #define PIXEL_MIN 0
6
@@ -11,15 +12,10 @@
7
 {
8
 
9
 
10
-/* get the sign of input variable (TODO: this is a dup, make common) */
11
-static inline int8_t signOf(int x)
12
-{
13
-    return (x >> 31) | ((int)((((uint32_t) - x)) >> 31));
14
-}
15
-
16
 static inline int8x8_t sign_diff_neon(const uint8x8_t in0, const uint8x8_t in1)
17
 {
18
-    int16x8_t in = vsubl_u8(in0, in1);
19
+    int16x8_t in = vreinterpretq_s16_u16(vsubl_u8(in0, in1));
20
+
21
     return vmovn_s16(vmaxq_s16(vminq_s16(in, vdupq_n_s16(1)), vdupq_n_s16(-1)));
22
 }
23
 
24
@@ -28,12 +24,13 @@
25
     int x = 0;
26
     for (; (x + 8) <= endX; x += 8)
27
     {
28
-        *(int8x8_t *)&dst[x]  = sign_diff_neon(*(uint8x8_t *)&src1[x], *(uint8x8_t *)&src2[x]);
29
+        int8x8_t sign = sign_diff_neon(vld1_u8(src1 + x), vld1_u8(src2 + x));
30
+        vst1_s8(dst + x, sign);
31
     }
32
 
33
     for (; x < endX; x++)
34
     {
35
-        dst[x] = signOf(src1[x] - src2[x]);
36
+        dst[x] = x265_signOf(src1[x] - src2[x]);
37
     }
38
 }
39
 
40
@@ -56,21 +53,20 @@
41
             int8x8x2_t shifter;
42
             shifter.val[1][0] = signLeft[0];
43
             static const int8x8_t index = {8, 0, 1, 2, 3, 4, 5, 6};
44
-            int8x8_t tbl = *(int8x8_t *)offsetEo;
45
+            int8x8_t tbl = vld1_s8(offsetEo);
46
             for (; (x + 8) <= width; x += 8)
47
             {
48
-                uint8x8_t in = *(uint8x8_t *)&rec[x];
49
-                vsignRight = sign_diff_neon(in, *(uint8x8_t *)&rec[x + 1]);
50
+                uint8x8_t in = vld1_u8(rec + x);
51
+                vsignRight = sign_diff_neon(in, vld1_u8(rec + x + 1));
52
                 shifter.val[0] = vneg_s8(vsignRight);
53
                 int8x8_t tmp = shifter.val[0];
54
                 int8x8_t edge = vtbl2_s8(shifter, index);
55
                 int8x8_t vedgeType = vadd_s8(vadd_s8(vsignRight, edge), vdup_n_s8(2));
56
                 shifter.val[1][0] = tmp[7];
57
                 int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
58
-                t1 = vaddw_u8(t1, in);
59
-                t1 = vmaxq_s16(t1, vdupq_n_s16(0));
60
-                t1 = vminq_s16(t1, vdupq_n_s16(255));
61
-                *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
62
+                t1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(t1),
63
+                                                    in));
64
+                vst1_u8(rec + x, vqmovun_s16(t1));
65
             }
66
             signLeft[0] = shifter.val[1][0];
67
         }
68
@@ -93,22 +89,26 @@
69
 
70
     if (width >= 8)
71
     {
72
-        int8x8_t tbl = *(int8x8_t *)offsetEo;
73
+        int8x8_t tbl = vld1_s8(offsetEo);
74
+        const int8x8_t c = vdup_n_s8(2);
75
+
76
         for (; (x + 8) <= width; x += 8)
77
         {
78
-            uint8x8_t in0 = *(uint8x8_t *)&rec[x];
80
-            uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
80
+            uint8x8_t in0 = vld1_u8(rec + x);
81
+            uint8x8_t in1 = vld1_u8(rec + x + stride);
82
             int8x8_t vsignDown = sign_diff_neon(in0, in1);
83
-            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
84
-            *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
85
+            int8x8_t vsignUp = vld1_s8(upBuff1 + x);
86
+            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c);
87
+            vst1_s8(upBuff1 + x, vneg_s8(vsignDown));
88
             int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
89
-            t1 = vaddw_u8(t1, in0);
90
-            *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
91
+            t1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(t1),
92
+                                                in0));
93
+            vst1_u8(rec + x, vqmovun_s16(t1));
94
         }
95
     }
96
     for (; x < width; x++)
97
     {
98
-        signDown = signOf(rec[x] - rec[x + stride]);
99
+        signDown = x265_signOf(rec[x] - rec[x + stride]);
100
         edgeType = signDown + upBuff1[x] + 2;
101
         upBuff1[x] = -signDown;
102
         rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
103
@@ -126,25 +126,26 @@
104
         int x = 0;
105
         if (width >= 8)
106
         {
107
-            int8x8_t tbl = *(int8x8_t *)offsetEo;
108
+            int8x8_t tbl = vld1_s8(offsetEo);
109
+            const int8x8_t c = vdup_n_s8(2);
110
+
111
             for (; (x + 8) <= width; x += 8)
112
             {
113
-                uint8x8_t in0 = *(uint8x8_t *)&rec[x];
114
-                uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
115
+                uint8x8_t in0 = vld1_u8(rec + x);
116
+                uint8x8_t in1 = vld1_u8(rec + x + stride);
117
                 int8x8_t vsignDown = sign_diff_neon(in0, in1);
118
-                int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
119
-                *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
120
+                int8x8_t vsignUp = vld1_s8(upBuff1 + x);
121
+                int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c);
122
+                vst1_s8(upBuff1 + x, vneg_s8(vsignDown));
123
                 int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
124
-                t1 = vaddw_u8(t1, in0);
125
-                t1 = vmaxq_s16(t1, vdupq_n_s16(0));
126
-                t1 = vminq_s16(t1, vdupq_n_s16(255));
127
-                *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
128
-
129
+                t1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(t1),
130
+                                                    in0));
131
+                vst1_u8(rec + x, vqmovun_s16(t1));
132
             }
133
         }
134
         for (; x < width; x++)
135
         {
136
-            signDown = signOf(rec[x] - rec[x + stride]);
137
+            signDown = x265_signOf(rec[x] - rec[x + stride]);
138
             edgeType = signDown + upBuff1[x] + 2;
139
             upBuff1[x] = -signDown;
140
             rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
141
@@ -157,11 +158,11 @@
142
 {
143
     int x;
144
 
145
-    if (abs(buff1 - bufft) < 16)
146
+    if (abs(static_cast<int>(buff1 - bufft)) < 16)
147
     {
148
         for (x = 0; x < width; x++)
149
         {
150
-            int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
151
+            int8_t signDown = x265_signOf(rec[x] - rec[x + stride + 1]);
152
             int edgeType = signDown + buff1[x] + 2;
153
             bufft[x + 1] = -signDown;
154
             rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);;
155
@@ -169,24 +170,26 @@
156
     }
157
     else
158
     {
159
-        int8x8_t tbl = *(int8x8_t *)offsetEo;
160
+        int8x8_t tbl = vld1_s8(offsetEo);
161
+        const int8x8_t c = vdup_n_s8(2);
162
+
163
         x = 0;
164
         for (; (x + 8) <= width; x += 8)
165
         {
166
-            uint8x8_t in0 = *(uint8x8_t *)&rec[x];
167
-            uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride + 1];
168
+            uint8x8_t in0 = vld1_u8(rec + x);
169
+            uint8x8_t in1 = vld1_u8(rec + x + stride + 1);
170
             int8x8_t vsignDown = sign_diff_neon(in0, in1);
171
-            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&buff1[x]), vdup_n_s8(2));
172
-            *(int8x8_t *)&bufft[x + 1] = vneg_s8(vsignDown);
173
+            int8x8_t vsignUp = vld1_s8(buff1 + x);
174
+            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c);
175
+            vst1_s8(bufft + x + 1, vneg_s8(vsignDown));
176
             int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
177
-            t1 = vaddw_u8(t1, in0);
178
-            t1 = vmaxq_s16(t1, vdupq_n_s16(0));
179
-            t1 = vminq_s16(t1, vdupq_n_s16(255));
180
-            *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
181
+            t1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(t1),
182
+                                                in0));
183
+            vst1_u8(rec + x, vqmovun_s16(t1));
184
         }
185
         for (; x < width; x++)
186
         {
187
-            int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
188
+            int8_t signDown = x265_signOf(rec[x] - rec[x + stride + 1]);
189
             int edgeType = signDown + buff1[x] + 2;
190
             bufft[x + 1] = -signDown;
191
             rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);;
192
@@ -200,26 +203,25 @@
193
 {
194
     int8_t signDown;
195
     int8_t edgeType;
196
-    int8x8_t tbl = *(int8x8_t *)offsetEo;
197
+    int8x8_t tbl = vld1_s8(offsetEo);
198
+    const int8x8_t c = vdup_n_s8(2);
199
 
200
     int x = startX + 1;
201
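A recurring pattern in this file's changes is replacing type-punned pointer loads and stores with the dedicated vld1/vst1 intrinsics, which avoids the misaligned, aliasing vector casts of the 3.6 code and lets the saturating narrow do the clamping. A minimal illustration of the two styles is sketched below; the helper is hypothetical and not x265 code:

    #include <arm_neon.h>
    #include <stdint.h>

    // Add a per-pixel int8 offset to a row of reconstructed pixels with
    // saturation, using explicit NEON loads/stores (tail handling omitted).
    static void add_offset_row(uint8_t *rec, const int8_t *offset, int width)
    {
        for (int x = 0; x + 8 <= width; x += 8)
        {
            // 3.6 style (removed above): uint8x8_t in = *(uint8x8_t *)&rec[x];
            // 4.0 style: dedicated unaligned load/store intrinsics.
            uint8x8_t in = vld1_u8(rec + x);
            int16x8_t sum = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(in)),
                                      vmovl_s8(vld1_s8(offset + x)));
            vst1_u8(rec + x, vqmovun_s16(sum));   // clamp to [0, 255]
        }
    }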
x265_3.6.tar.gz/source/common/aarch64/mc-a-sve2.S -> x265_4.0.tar.gz/source/common/aarch64/mc-a-sve2.S Changed
201
 
1
@@ -219,7 +219,7 @@
2
     mov             x11, #0
3
     whilelt         p0.b, x11, x10
4
     mov             w12, #8
5
-.loop_gt_32_pixel_avg_pp_48x64:
6
+.Loop_gt_32_pixel_avg_pp_48x64:
7
     sub             w12, w12, #1
8
 .rept 8
9
     ld1b            {z0.b}, p0/z, x2
10
@@ -230,7 +230,7 @@
11
     st1b            {z0.b}, p0, x0
12
     add             x0, x0, x1
13
 .endr
14
-    cbnz            w12, .loop_gt_32_pixel_avg_pp_48x64
15
+    cbnz            w12, .Loop_gt_32_pixel_avg_pp_48x64
16
     ret
17
 endfunc
18
 
19
@@ -339,7 +339,7 @@
20
     mov             w12, #\h / 2
21
     ptrue           p0.b, vl16
22
     ptrue           p2.h, vl6
23
-.loop_sve2_addavg_6x\h\():
24
+.Loop_sve2_addavg_6x\h\():
25
     sub             w12, w12, #1
26
     ld1b            {z0.b}, p0/z, x0
27
     ld1b            {z1.b}, p0/z, x1
28
@@ -359,7 +359,7 @@
29
     add             x2, x2, x5
30
     st1b            {z2.h}, p2, x2
31
     add             x2, x2, x5
32
-    cbnz            w12, .loop_sve2_addavg_6x\h
33
+    cbnz            w12, .Loop_sve2_addavg_6x\h
34
     ret
35
 endfunc
36
 .endm
37
@@ -398,7 +398,7 @@
38
 function PFX(addAvg_8x\h\()_sve2)
39
     mov             w12, #\h / 2
40
     ptrue           p0.b, vl16
41
-.loop_sve2_addavg_8x\h\():
42
+.Loop_sve2_addavg_8x\h\():
43
     sub             w12, w12, #1
44
     ld1b            {z0.b}, p0/z, x0
45
     ld1b            {z1.b}, p0/z, x1
46
@@ -418,7 +418,7 @@
47
     add             x2, x2, x5
48
     st1b            {z2.h}, p0, x2
49
     add             x2, x2, x5
50
-    cbnz            w12, .loop_sve2_addavg_8x\h
51
+    cbnz            w12, .Loop_sve2_addavg_8x\h
52
     ret
53
 endfunc
54
 .endm
55
@@ -440,7 +440,7 @@
56
     bgt             .vl_gt_16_addAvg_12x\h
57
     ptrue           p0.b, vl16
58
     ptrue           p1.b, vl8
59
-.loop_sve2_addavg_12x\h\():
60
+.Loop_sve2_addavg_12x\h\():
61
     sub             w12, w12, #1
62
     ld1b            {z0.b}, p0/z, x0
63
     ld1b            {z1.b}, p0/z, x1
64
@@ -457,13 +457,13 @@
65
     st1b            {z0.h}, p0, x2
66
     st1b            {z2.h}, p1, x2, #1, mul vl
67
     add             x2, x2, x5
68
-    cbnz            w12, .loop_sve2_addavg_12x\h
69
+    cbnz            w12, .Loop_sve2_addavg_12x\h
70
     ret
71
 .vl_gt_16_addAvg_12x\h\():
72
     mov             x10, #24
73
     mov             x11, #0
74
     whilelt         p0.b, x11, x10
75
-.loop_sve2_gt_16_addavg_12x\h\():
76
+.Loop_sve2_gt_16_addavg_12x\h\():
77
     sub             w12, w12, #1
78
     ld1b            {z0.b}, p0/z, x0
79
     ld1b            {z1.b}, p0/z, x1
80
@@ -476,7 +476,7 @@
81
     add             z2.b, z2.b, #0x80
82
     st1b            {z0.h}, p0, x2
83
     add             x2, x2, x5
84
-    cbnz            w12, .loop_sve2_gt_16_addavg_12x\h
85
+    cbnz            w12, .Loop_sve2_gt_16_addavg_12x\h
86
     ret
87
 endfunc
88
 .endm
89
@@ -491,7 +491,7 @@
90
     cmp             x9, #16
91
     bgt             .vl_gt_16_addAvg_16x\h
92
     ptrue           p0.b, vl16
93
-.loop_eq_16_sve2_addavg_16x\h\():
94
+.Loop_eq_16_sve2_addavg_16x\h\():
95
     sub             w12, w12, #1
96
     ld1b            {z0.b}, p0/z, x0
97
     ld1b            {z1.b}, p0/z, x1
98
@@ -508,13 +508,13 @@
99
     st1b            {z0.h}, p0, x2
100
     st1b            {z2.h}, p0, x2, #1, mul vl
101
     add             x2, x2, x5
102
-    cbnz            w12, .loop_eq_16_sve2_addavg_16x\h
103
+    cbnz            w12, .Loop_eq_16_sve2_addavg_16x\h
104
     ret
105
 .vl_gt_16_addAvg_16x\h\():
106
     cmp             x9, #32
107
     bgt             .vl_gt_32_addAvg_16x\h
108
     ptrue           p0.b, vl32
109
-.loop_gt_16_sve2_addavg_16x\h\():
110
+.Loop_gt_16_sve2_addavg_16x\h\():
111
     sub             w12, w12, #1
112
     ld1b            {z0.b}, p0/z, x0
113
     ld1b            {z1.b}, p0/z, x1
114
@@ -525,13 +525,13 @@
115
     add             z0.b, z0.b, #0x80
116
     st1b            {z0.h}, p1, x2
117
     add             x2, x2, x5
118
-    cbnz            w12, .loop_gt_16_sve2_addavg_16x\h
119
+    cbnz            w12, .Loop_gt_16_sve2_addavg_16x\h
120
     ret
121
 .vl_gt_32_addAvg_16x\h\():
122
     mov             x10, #48
123
     mov             x11, #0
124
     whilelt         p0.b, x11, x10
125
-.loop_gt_32_sve2_addavg_16x\h\():
126
+.Loop_gt_32_sve2_addavg_16x\h\():
127
     sub             w12, w12, #1
128
     ld1b            {z0.b}, p0/z, x0
129
     add             x0, x0, x3, lsl #1
130
@@ -541,7 +541,7 @@
131
     add             z0.b, z0.b, #0x80
132
     st1b            {z0.h}, p0, x2
133
     add             x2, x2, x5
134
-    cbnz            w12, .loop_gt_32_sve2_addavg_16x\h
135
+    cbnz            w12, .Loop_gt_32_sve2_addavg_16x\h
136
     ret
137
 endfunc
138
 .endm
139
@@ -561,7 +561,7 @@
140
     cmp             x9, #16
141
     bgt             .vl_gt_16_addAvg_24x\h
142
     addAvg_start
143
-.loop_eq_16_sve2_addavg_24x\h\():
144
+.Loop_eq_16_sve2_addavg_24x\h\():
145
     sub             w12, w12, #1
146
     ld1             {v0.16b-v2.16b}, x0, x3
147
     ld1             {v3.16b-v5.16b}, x1, x4
148
@@ -572,14 +572,14 @@
149
     sqxtun          v1.8b, v1.8h
150
     sqxtun          v2.8b, v2.8h
151
     st1             {v0.8b-v2.8b}, x2, x5
152
-    cbnz            w12, .loop_eq_16_sve2_addavg_24x\h
153
+    cbnz            w12, .Loop_eq_16_sve2_addavg_24x\h
154
     ret
155
 .vl_gt_16_addAvg_24x\h\():
156
     cmp             x9, #48
157
     bgt             .vl_gt_48_addAvg_24x\h
158
     ptrue           p0.b, vl32
159
     ptrue           p1.b, vl16
160
-.loop_gt_16_sve2_addavg_24x\h\():
161
+.Loop_gt_16_sve2_addavg_24x\h\():
162
     sub             w12, w12, #1
163
     ld1b            {z0.b}, p0/z, x0
164
     ld1b            {z1.b}, p1/z, x0, #1, mul vl
165
@@ -596,13 +596,13 @@
166
     st1b            {z0.h}, p0, x2
167
     st1b            {z1.h}, p1, x2, #1, mul vl
168
     add             x2, x2, x5
169
-    cbnz            w12, .loop_gt_16_sve2_addavg_24x\h
170
+    cbnz            w12, .Loop_gt_16_sve2_addavg_24x\h
171
     ret
172
 .vl_gt_48_addAvg_24x\h\():
173
     mov             x10, #48
174
     mov             x11, #0
175
     whilelt         p0.b, x11, x10
176
-.loop_gt_48_sve2_addavg_24x\h\():
177
+.Loop_gt_48_sve2_addavg_24x\h\():
178
     sub             w12, w12, #1
179
     ld1b            {z0.b}, p0/z, x0
180
     ld1b            {z2.b}, p0/z, x1
181
@@ -613,7 +613,7 @@
182
     add             z0.b, z0.b, #0x80
183
     st1b            {z0.h}, p0, x2
184
     add             x2, x2, x5
185
-    cbnz            w12, .loop_gt_48_sve2_addavg_24x\h
186
+    cbnz            w12, .Loop_gt_48_sve2_addavg_24x\h
187
     ret
188
 endfunc
189
 .endm
190
@@ -628,7 +628,7 @@
191
     cmp             x9, #16
192
     bgt             .vl_gt_16_addAvg_32x\h
193
     ptrue           p0.b, vl16
194
-.loop_eq_16_sve2_addavg_32x\h\():
195
+.Loop_eq_16_sve2_addavg_32x\h\():
196
     sub             w12, w12, #1
197
     ld1b            {z0.b}, p0/z, x0
198
     ld1b            {z1.b}, p0/z, x0, #1, mul vl
199
@@ -657,13 +657,13 @@
200
     st1b            {z2.h}, p0, x2, #2, mul vl
201
x265_3.6.tar.gz/source/common/aarch64/mc-a.S -> x265_4.0.tar.gz/source/common/aarch64/mc-a.S Changed
145
 
1
@@ -283,7 +283,7 @@
2
     addAvg_start
3
     mov             w12, #\h / 2
4
     sub             x5, x5, #4
5
-.loop_addavg_6x\h:
6
+.Loop_addavg_6x\h:
7
     sub             w12, w12, #1
8
     ld1             {v0.16b}, x0, x3
9
     ld1             {v1.16b}, x1, x4
10
@@ -305,7 +305,7 @@
11
     st1             {v0.h}2, x2, x5
12
     str             s1, x2, #4
13
     st1             {v1.h}2, x2, x5
14
-    cbnz            w12, .loop_addavg_6x\h
15
+    cbnz            w12, .Loop_addavg_6x\h
16
     ret
17
 endfunc
18
 .endm
19
@@ -344,7 +344,7 @@
20
 function PFX(addAvg_8x\h\()_neon)
21
     addAvg_start
22
     mov             w12, #\h / 2
23
-.loop_addavg_8x\h:
24
+.Loop_addavg_8x\h:
25
     sub             w12, w12, #1
26
     ld1             {v0.16b}, x0, x3
27
     ld1             {v1.16b}, x1, x4
28
@@ -364,7 +364,7 @@
29
     sqxtun          v1.8b, v1.8h
30
     st1             {v0.8b}, x2, x5
31
     st1             {v1.8b}, x2, x5
32
-    cbnz            w12, .loop_addavg_8x\h
33
+    cbnz            w12, .Loop_addavg_8x\h
34
     ret
35
 endfunc
36
 .endm
37
@@ -385,7 +385,7 @@
38
     sub             x4, x4, #16
39
     sub             x5, x5, #8
40
     mov             w12, #\h
41
-.loop_addAvg_12X\h\():
42
+.Loop_addAvg_12X\h\():
43
     sub             w12, w12, #1
44
     ld1             {v0.16b}, x0, #16
45
     ld1             {v1.16b}, x1, #16
46
@@ -403,7 +403,7 @@
47
     sqxtun          v1.8b, v1.8h
48
     st1             {v0.8b}, x2, #8
49
     st1             {v1.s}0, x2, x5
50
-    cbnz            w12, .loop_addAvg_12X\h
51
+    cbnz            w12, .Loop_addAvg_12X\h
52
     ret
53
 endfunc
54
 .endm
55
@@ -415,7 +415,7 @@
56
 function PFX(addAvg_16x\h\()_neon)
57
     addAvg_start
58
     mov             w12, #\h
59
-.loop_addavg_16x\h:
60
+.Loop_addavg_16x\h:
61
     sub             w12, w12, #1
62
     ld1             {v0.8h-v1.8h}, x0, x3
63
     ld1             {v2.8h-v3.8h}, x1, x4
64
@@ -424,7 +424,7 @@
65
     sqxtun          v0.8b, v0.8h
66
     sqxtun2         v0.16b, v1.8h
67
     st1             {v0.16b}, x2, x5
68
-    cbnz            w12, .loop_addavg_16x\h
69
+    cbnz            w12, .Loop_addavg_16x\h
70
     ret
71
 endfunc
72
 .endm
73
@@ -441,7 +441,7 @@
74
 function PFX(addAvg_24x\h\()_neon)
75
     addAvg_start
76
     mov             w12, #\h
77
-.loop_addavg_24x\h\():
78
+.Loop_addavg_24x\h\():
79
     sub             w12, w12, #1
80
     ld1             {v0.16b-v2.16b}, x0, x3
81
     ld1             {v3.16b-v5.16b}, x1, x4
82
@@ -452,7 +452,7 @@
83
     sqxtun          v1.8b, v1.8h
84
     sqxtun          v2.8b, v2.8h
85
     st1             {v0.8b-v2.8b}, x2, x5
86
-    cbnz            w12, .loop_addavg_24x\h
87
+    cbnz            w12, .Loop_addavg_24x\h
88
     ret
89
 endfunc
90
 .endm
91
@@ -464,7 +464,7 @@
92
 function PFX(addAvg_32x\h\()_neon)
93
     addAvg_start
94
     mov             w12, #\h
95
-.loop_addavg_32x\h\():
96
+.Loop_addavg_32x\h\():
97
     sub             w12, w12, #1
98
     ld1             {v0.8h-v3.8h}, x0, x3
99
     ld1             {v4.8h-v7.8h}, x1, x4
100
@@ -477,7 +477,7 @@
101
     sqxtun          v2.8b, v2.8h
102
     sqxtun          v3.8b, v3.8h
103
     st1             {v0.8b-v3.8b}, x2, x5
104
-    cbnz            w12, .loop_addavg_32x\h
105
+    cbnz            w12, .Loop_addavg_32x\h
106
     ret
107
 endfunc
108
 .endm
109
@@ -494,7 +494,7 @@
110
     sub             x3, x3, #64
111
     sub             x4, x4, #64
112
     mov             w12, #64
113
-.loop_addavg_48x64:
114
+.Loop_addavg_48x64:
115
     sub             w12, w12, #1
116
     ld1             {v0.8h-v3.8h}, x0, #64
117
     ld1             {v4.8h-v7.8h}, x1, #64
118
@@ -513,7 +513,7 @@
119
     sqxtun          v2.8b, v20.8h
120
     sqxtun2         v2.16b, v21.8h
121
     st1             {v0.16b-v2.16b}, x2, x5
122
-    cbnz            w12, .loop_addavg_48x64
123
+    cbnz            w12, .Loop_addavg_48x64
124
     ret
125
 endfunc
126
 
127
@@ -523,7 +523,7 @@
128
     mov             w12, #\h
129
     sub             x3, x3, #64
130
     sub             x4, x4, #64
131
-.loop_addavg_64x\h\():
132
+.Loop_addavg_64x\h\():
133
     sub             w12, w12, #1
134
     ld1             {v0.8h-v3.8h}, x0, #64
135
     ld1             {v4.8h-v7.8h}, x1, #64
136
@@ -546,7 +546,7 @@
137
     sqxtun          v3.8b, v22.8h
138
     sqxtun2         v3.16b, v23.8h
139
     st1             {v0.16b-v3.16b}, x2, x5
140
-    cbnz            w12, .loop_addavg_64x\h
141
+    cbnz            w12, .Loop_addavg_64x\h
142
     ret
143
 endfunc
144
 .endm
145
x265_4.0.tar.gz/source/common/aarch64/mem-neon.h Added
201
 
1
@@ -0,0 +1,268 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_COMMON_AARCH64_MEM_NEON_H
26
+#define X265_COMMON_AARCH64_MEM_NEON_H
27
+
28
+#include <arm_neon.h>
29
+#include <cassert>
30
+#include <stdint.h>
31
+
32
+// Load 4 bytes into the low half of a uint8x8_t, zero the upper half.
33
+static uint8x8_t inline load_u8x4x1(const uint8_t *s)
34
+{
35
+    uint8x8_t ret = vdup_n_u8(0);
36
+
37
+    ret = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t*)s,
38
+                                            vreinterpret_u32_u8(ret), 0));
39
+    return ret;
40
+}
41
+
42
+static uint8x8_t inline load_u8x4x2(const uint8_t *s, intptr_t stride)
43
+{
44
+    uint8x8_t ret = vdup_n_u8(0);
45
+
46
+    ret = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t*)s,
47
+                                            vreinterpret_u32_u8(ret), 0));
48
+    s += stride;
49
+    ret = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t*)s,
50
+                                            vreinterpret_u32_u8(ret), 1));
51
+
52
+    return ret;
53
+}
54
+
55
+// Store 4 bytes from the low half of a uint8x8_t.
56
+static void inline store_u8x4x1(uint8_t *d, const uint8x8_t s)
57
+{
58
+    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(s), 0);
59
+}
60
+
61
+// Store N blocks of 32-bits from (N / 2) D-Registers.
62
+template<int N>
63
+static void inline store_u8x4_strided_xN(uint8_t *d, intptr_t stride,
64
+                                         const uint8x8_t *s)
65
+{
66
+    assert(N % 2 == 0);
67
+    for (int i = 0; i < N / 2; ++i)
68
+    {
69
+        vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(s[i]), 0);
70
+        d += stride;
71
+        vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(s[i]), 1);
72
+        d += stride;
73
+    }
74
+}
75
+
76
+template<int N>
77
+static void inline load_u8x8xn(const uint8_t *src, const intptr_t stride,
78
+                               uint8x8_t *dst)
79
+{
80
+    for (int i = 0; i < N; ++i)
81
+    {
82
+        dst[i] = vld1_u8(src);
83
+        src += stride;
84
+    }
85
+}
86
+
87
+template<int N>
88
+static void inline load_u8x16xn(const uint8_t *src, const intptr_t stride,
89
+                                uint8x16_t *dst)
90
+{
91
+    for (int i = 0; i < N; ++i)
92
+    {
93
+        dst[i] = vld1q_u8(src);
94
+        src += stride;
95
+    }
96
+}
97
+
98
+template<int N>
99
+static void inline store_u8x2xn(uint8_t *dst, intptr_t dst_stride,
100
+                                const uint8x8_t *src)
101
+{
102
+    for (int i = 0; i < N; ++i)
103
+    {
104
+        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(src[i]), 0);
105
+        dst += dst_stride;
106
+    }
107
+}
108
+
109
+template<int N>
110
+static void inline store_u8x4xn(uint8_t *dst, intptr_t dst_stride,
111
+                                const uint8x8_t *src)
112
+{
113
+    for (int i = 0; i < N; ++i)
114
+    {
115
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(src[i]), 0);
116
+        dst += dst_stride;
117
+    }
118
+}
119
+
120
+template<int N>
121
+static void inline store_u8x6xn(uint8_t *dst, intptr_t dst_stride,
122
+                                const uint8x8_t *src)
123
+{
124
+    for (int i = 0; i < N; ++i)
125
+    {
126
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(src[i]), 0);
127
+        vst1_lane_u16((uint16_t *)(dst + 4), vreinterpret_u16_u8(src[i]), 2);
128
+        dst += dst_stride;
129
+    }
130
+}
131
+
132
+template<int N>
133
+static void inline store_u8x8xn(uint8_t *dst, intptr_t dst_stride,
134
+                                const uint8x8_t *src)
135
+{
136
+    for (int i = 0; i < N; ++i)
137
+    {
138
+        vst1_u8(dst, src[i]);
139
+        dst += dst_stride;
140
+    }
141
+}
142
+
143
+template<int N, int M>
144
+static void inline store_u8xnxm(uint8_t *dst, intptr_t dst_stride,
145
+                                const uint8x8_t *src)
146
+{
147
+    switch (N)
148
+    {
149
+    case 2: return store_u8x2xn<M>(dst, dst_stride, src);
150
+    case 4: return store_u8x4xn<M>(dst, dst_stride, src);
151
+    case 6: return store_u8x6xn<M>(dst, dst_stride, src);
152
+    case 8: return store_u8x8xn<M>(dst, dst_stride, src);
153
+    }
154
+}
155
+
156
+template<int N>
157
+static void inline store_u8x16xn(uint8_t *dst, intptr_t dst_stride,
158
+                                 const uint8x16_t *src)
159
+{
160
+    for (int i = 0; i < N; ++i)
161
+    {
162
+        vst1q_u8(dst, src[i]);
163
+        dst += dst_stride;
164
+    }
165
+}
166
+
167
+template<int N>
168
+static void inline load_s16x4xn(const int16_t *src, const intptr_t stride,
169
+                                int16x4_t *dst)
170
+{
171
+    for (int i = 0; i < N; ++i)
172
+    {
173
+        dst[i] = vld1_s16(src);
174
+        src += stride;
175
+    }
176
+}
177
+
178
+template<int N>
179
+static void inline load_s16x8xn(const int16_t *src, const intptr_t stride,
180
+                                int16x8_t *dst)
181
+{
182
+    for (int i = 0; i < N; ++i)
183
+    {
184
+        dst[i] = vld1q_s16(src);
185
+        src += stride;
186
+    }
187
+}
188
+
189
+template<int N>
190
+static void inline store_s16x2xn(int16_t *dst, intptr_t dst_stride,
191
+                                 const int16x4_t *src)
192
+{
193
+    for (int i = 0; i < N; ++i)
194
+    {
195
+        vst1_lane_s32((int32_t*)dst, vreinterpret_s32_s16(src[i]), 0);
196
+        dst += dst_stride;
197
+    }
198
+}
199
+
200
+template<int N>
201
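The templates in this new header mostly exist so block-load/store loops can be written once and unrolled at compile time (N is the row count). A hedged usage sketch, assuming the header behaves as defined above; the copy helper itself is hypothetical:

    #include <arm_neon.h>
    #include "mem-neon.h"   // the header added above

    // Copy an 8x4 block of 8-bit pixels using the row-count templates.
    static void copy_8x4(uint8_t *dst, intptr_t dstStride,
                         const uint8_t *src, intptr_t srcStride)
    {
        uint8x8_t rows[4];
        load_u8x8xn<4>(src, srcStride, rows);
        store_u8x8xn<4>(dst, dstStride, rows);
    }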
x265_4.0.tar.gz/source/common/aarch64/neon-sve-bridge.h Added
69
 
1
@@ -0,0 +1,67 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *          Jonathan Wright <jonathan.wright@arm.com>
7
+ *
8
+ * This program is free software; you can redistribute it and/or modify
9
+ * it under the terms of the GNU General Public License as published by
10
+ * the Free Software Foundation; either version 2 of the License, or
11
+ * (at your option) any later version.
12
+ *
13
+ * This program is distributed in the hope that it will be useful,
14
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ * GNU General Public License for more details.
17
+ *
18
+ * You should have received a copy of the GNU General Public License
19
+ * along with this program; if not, write to the Free Software
20
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21
+ *
22
+ * This program is also available under a commercial proprietary license.
23
+ * For more information, contact us at license @ x265.com.
24
+ *****************************************************************************/
25
+
26
+#ifndef X265_COMMON_AARCH64_NEON_SVE_BRIDGE_H
27
+#define X265_COMMON_AARCH64_NEON_SVE_BRIDGE_H
28
+
29
+#include <arm_neon.h>
30
+
31
+#if defined(HAVE_SVE) && HAVE_SVE_BRIDGE
32
+#include <arm_sve.h>
33
+#include <arm_neon_sve_bridge.h>
34
+
35
+/* We can access instructions that are exclusive to the SVE or SVE2 instruction
36
+ * sets from a predominantly Neon context by making use of the Neon-SVE bridge
37
+ * intrinsics to reinterpret Neon vectors as SVE vectors - with the high part of
38
+ * the SVE vector (if it's longer than 128 bits) being "don't care".
39
+ *
40
+ * While sub-optimal on machines that have SVE vector length > 128-bit - as the
41
+ * remainder of the vector is unused - this approach is still beneficial when
42
+ * compared to a Neon-only implementation. */
43
+
44
+static inline int32x4_t x265_vld1sh_s32(const int16_t *ptr)
45
+{
46
+    return svget_neonq_s32(svld1sh_s32(svptrue_pat_b32(SV_VL4), ptr));
47
+}
48
+
49
+static inline int64x2_t x265_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y)
50
+{
51
+    return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc),
52
+                                     svset_neonq_s16(svundef_s16(), x),
53
+                                     svset_neonq_s16(svundef_s16(), y)));
54
+}
55
+
56
+static inline int8x16_t x265_sve_mask(const int x, const int endX,
57
+                                      const int8x16_t in)
58
+{
59
+    // Use predicate to shift "unused lanes" outside of range [-2, 2]
60
+    svbool_t svpred = svwhilelt_b8(x, endX);
61
+    svint8_t edge_type = svsel_s8(svpred, svset_neonq_s8(svundef_s8(), in),
62
+                                  svdup_n_s8(-3));
63
+    return svget_neonq_s8(edge_type);
64
+}
65
+
66
+#endif // defined(HAVE_SVE) && HAVE_SVE_BRIDGE
67
+
68
+#endif // X265_COMMON_AARCH64_NEON_SVE_BRIDGE_H
69
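As the header comment explains, the bridge lets mostly-Neon kernels borrow SVE/SVE2-only instructions on vectors that live in Neon registers. A minimal sketch of how x265_sdotq_s16 might be used to accumulate a sum of squares (the helper name row_sse is illustrative, and this assumes a build where HAVE_SVE and HAVE_SVE_BRIDGE are set):

    #include <arm_neon.h>
    #include "neon-sve-bridge.h"   // provides x265_sdotq_s16 when SVE is available

    // Sum of squares of 8 int16 residuals, accumulated with the SVE SDOT
    // instruction through the Neon-SVE bridge.
    static inline int64_t row_sse(const int16_t *diff)
    {
        int16x8_t d = vld1q_s16(diff);
        int64x2_t acc = x265_sdotq_s16(vdupq_n_s64(0), d, d);
        return vaddvq_s64(acc);
    }

On hardware with SVE vectors longer than 128 bits the upper lanes go unused, which is the trade-off the comment above acknowledges.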
x265_3.6.tar.gz/source/common/aarch64/p2s-sve.S -> x265_4.0.tar.gz/source/common/aarch64/p2s-sve.S Changed
55
 
1
@@ -204,7 +204,7 @@
2
 #else
3
     p2s_start
4
     mov             x9, #\h
5
-.loop_filter_sve_P2S_32x\h:
6
+.Loop_filter_sve_P2S_32x\h:
7
     sub             x9, x9, #1
8
     ld1             {v0.16b-v1.16b}, x0, x1
9
     ushll           v22.8h, v0.8b,  #P2S_SHIFT
10
@@ -216,7 +216,7 @@
11
     add             v24.8h, v24.8h, v31.8h
12
     add             v25.8h, v25.8h, v31.8h
13
     st1             {v22.16b-v25.16b}, x2, x3
14
-    cbnz            x9, .loop_filter_sve_P2S_32x\h
15
+    cbnz            x9, .Loop_filter_sve_P2S_32x\h
16
     ret
17
 #endif
18
 endfunc
19
@@ -331,7 +331,7 @@
20
     p2s_start
21
     sub             x3, x3, #64
22
     mov             x9, #\h
23
-.loop_filter_sve_P2S_64x\h:
24
+.Loop_filter_sve_P2S_64x\h:
25
     sub             x9, x9, #1
26
     ld1             {v0.16b-v3.16b}, x0, x1
27
     ushll           v16.8h, v0.8b,  #P2S_SHIFT
28
@@ -352,7 +352,7 @@
29
     add             v23.8h, v23.8h, v31.8h
30
     st1             {v16.16b-v19.16b}, x2, #64
31
     st1             {v20.16b-v23.16b}, x2, x3
32
-    cbnz            x9, .loop_filter_sve_P2S_64x\h
33
+    cbnz            x9, .Loop_filter_sve_P2S_64x\h
34
     ret
35
 #endif
36
 endfunc
37
@@ -422,7 +422,7 @@
38
     p2s_start
39
     sub             x3, x3, #64
40
     mov             x9, #64
41
-.loop_filterP2S_sve_48x64:
42
+.Loop_filterP2S_sve_48x64:
43
     sub            x9, x9, #1
44
     ld1             {v0.16b-v2.16b}, x0, x1
45
     ushll           v16.8h, v0.8b,  #P2S_SHIFT
46
@@ -439,7 +439,7 @@
47
     add             v21.8h, v21.8h, v31.8h
48
     st1             {v16.16b-v19.16b}, x2, #64
49
     st1             {v20.16b-v21.16b}, x2, x3
50
-    cbnz            x9, .loop_filterP2S_sve_48x64
51
+    cbnz            x9, .Loop_filterP2S_sve_48x64
52
     ret
53
 #endif
54
 endfunc
55
x265_3.6.tar.gz/source/common/aarch64/p2s.S -> x265_4.0.tar.gz/source/common/aarch64/p2s.S Changed
54
 
1
@@ -262,7 +262,7 @@
2
 function PFX(filterPixelToShort_32x\h\()_neon)
3
     p2s_start
4
     mov             x9, #\h
5
-.loop_filterP2S_32x\h:
6
+.Loop_filterP2S_32x\h:
7
     sub             x9, x9, #1
8
 #if HIGH_BIT_DEPTH
9
     ld1             {v0.16b-v3.16b}, x0, x1
10
@@ -282,7 +282,7 @@
11
     add             v24.8h, v24.8h, v31.8h
12
     add             v25.8h, v25.8h, v31.8h
13
     st1             {v22.16b-v25.16b}, x2, x3
14
-    cbnz            x9, .loop_filterP2S_32x\h
15
+    cbnz            x9, .Loop_filterP2S_32x\h
16
     ret
17
 endfunc
18
 .endm
19
@@ -302,7 +302,7 @@
20
 #endif
21
     sub             x3, x3, #64
22
     mov             x9, #\h
23
-.loop_filterP2S_64x\h:
24
+.Loop_filterP2S_64x\h:
25
     sub             x9, x9, #1
26
 #if HIGH_BIT_DEPTH
27
     ld1             {v0.16b-v3.16b}, x0, #64
28
@@ -336,7 +336,7 @@
29
     add             v23.8h, v23.8h, v31.8h
30
     st1             {v16.16b-v19.16b}, x2, #64
31
     st1             {v20.16b-v23.16b}, x2, x3
32
-    cbnz            x9, .loop_filterP2S_64x\h
33
+    cbnz            x9, .Loop_filterP2S_64x\h
34
     ret
35
 endfunc
36
 .endm
37
@@ -353,7 +353,7 @@
38
 #endif
39
     sub             x3, x3, #64
40
     mov             x9, #64
41
-.loop_filterP2S_48x64:
42
+.Loop_filterP2S_48x64:
43
     sub            x9, x9, #1
44
 #if HIGH_BIT_DEPTH
45
     ld1             {v0.16b-v3.16b}, x0, #64
46
@@ -381,6 +381,6 @@
47
     add             v21.8h, v21.8h, v31.8h
48
     st1             {v16.16b-v19.16b}, x2, #64
49
     st1             {v20.16b-v21.16b}, x2, x3
50
-    cbnz            x9, .loop_filterP2S_48x64
51
+    cbnz            x9, .Loop_filterP2S_48x64
52
     ret
53
 endfunc
54
x265_3.6.tar.gz/source/common/aarch64/pixel-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/pixel-prim.cpp Changed
201
 
1
@@ -7,6 +7,8 @@
2
 #include "arm64-utils.h"
3
 #if HAVE_NEON
4
 
5
+#include "mem-neon.h"
6
+
7
 #include <arm_neon.h>
8
 
9
 using namespace X265_NS;
10
@@ -24,26 +26,32 @@
11
     sub = vsubq_s16(a, b);
12
 }
13
 
14
-static inline void transpose_8h(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
15
+static inline void transpose_8h_8h(int16x8_t &t1, int16x8_t &t2,
16
+                                   const int16x8_t s1, const int16x8_t s2)
17
 {
18
     t1 = vtrn1q_s16(s1, s2);
19
     t2 = vtrn2q_s16(s1, s2);
20
 }
21
 
22
-static inline void transpose_4s(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
23
+static inline void transpose_4s_8h(int16x8_t &t1, int16x8_t &t2,
24
+                                   const int16x8_t s1, const int16x8_t s2)
25
 {
26
-    t1 = vtrn1q_s32(s1, s2);
27
-    t2 = vtrn2q_s32(s1, s2);
28
+    int32x4_t tmp1 = vreinterpretq_s32_s16(s1);
29
+    int32x4_t tmp2 = vreinterpretq_s32_s16(s2);
30
+
31
+    t1 = vreinterpretq_s16_s32(vtrn1q_s32(tmp1, tmp2));
32
+    t2 = vreinterpretq_s16_s32(vtrn2q_s32(tmp1, tmp2));
33
 }
34
 
35
-#if (X265_DEPTH <= 10)
36
-static inline void transpose_2d(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
37
+static inline void transpose_2d_8h(int16x8_t &t1, int16x8_t &t2,
38
+                                   const int16x8_t s1, const int16x8_t s2)
39
 {
40
-    t1 = vtrn1q_s64(s1, s2);
41
-    t2 = vtrn2q_s64(s1, s2);
42
-}
43
-#endif
44
+    int64x2_t tmp1 = vreinterpretq_s64_s16(s1);
45
+    int64x2_t tmp2 = vreinterpretq_s64_s16(s2);
46
 
47
+    t1 = vreinterpretq_s16_s64(vtrn1q_s64(tmp1, tmp2));
48
+    t2 = vreinterpretq_s16_s64(vtrn2q_s64(tmp1, tmp2));
49
+}
50
 
51
 static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2,
52
                                int16x8_t a, int16x8_t  b, int16x8_t  c, int16x8_t  d)
53
@@ -73,29 +81,25 @@
54
     SUMSUB_AB(v4 , v6 , v16, v18);
55
     SUMSUB_AB(v5 , v7 , v17, v19);
56
 
57
-    v0 = vtrn1q_s16(v4, v5);
58
-    v1 = vtrn2q_s16(v4, v5);
59
-    v2 = vtrn1q_s16(v6, v7);
60
-    v3 = vtrn2q_s16(v6, v7);
61
+    transpose_8h_8h(v0, v1, v4, v5);
62
+    transpose_8h_8h(v2, v3, v6, v7);
63
 
64
     SUMSUB_AB(v16, v17, v0,  v1);
65
     SUMSUB_AB(v18, v19, v2,  v3);
66
 
67
-    v0 = vtrn1q_s32(v16, v18);
68
-    v1 = vtrn2q_s32(v16, v18);
69
-    v2 = vtrn1q_s32(v17, v19);
70
-    v3 = vtrn2q_s32(v17, v19);
71
+    transpose_4s_8h(v0, v1, v16, v18);
72
+    transpose_4s_8h(v2, v3, v17, v19);
73
 
74
-    v0 = vabsq_s16(v0);
75
-    v1 = vabsq_s16(v1);
76
-    v2 = vabsq_s16(v2);
77
-    v3 = vabsq_s16(v3);
78
+    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
79
+    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
80
+    uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
81
+    uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
82
 
83
-    v0 = vmaxq_u16(v0, v1);
84
-    v1 = vmaxq_u16(v2, v3);
85
+    uint16x8_t max0 = vmaxq_u16(abs0, abs1);
86
+    uint16x8_t max1 = vmaxq_u16(abs2, abs3);
87
 
88
-    v0 = vaddq_u16(v0, v1);
89
-    return vaddlvq_u16(v0);
90
+    uint16x8_t sum = vaddq_u16(max0, max1);
91
+    return vaddlvq_u16(sum);
92
 }
93
 
94
 static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
95
@@ -103,22 +107,19 @@
96
     int16x8_t v2, v3;
97
     SUMSUB_AB(v2,  v3,  v0,  v1);
98
 
99
-    v0 = vzip1q_s64(v2, v3);
100
-    v1 = vzip2q_s64(v2, v3);
101
+    transpose_2d_8h(v0, v1, v2, v3);
102
     SUMSUB_AB(v2,  v3,  v0,  v1);
103
 
104
-    v0 = vtrn1q_s16(v2, v3);
105
-    v1 = vtrn2q_s16(v2, v3);
106
+    transpose_8h_8h(v0, v1, v2, v3);
107
     SUMSUB_AB(v2,  v3,  v0,  v1);
108
 
109
-    v0 = vtrn1q_s32(v2, v3);
110
-    v1 = vtrn2q_s32(v2, v3);
111
+    transpose_4s_8h(v0, v1, v2, v3);
112
 
113
-    v0 = vabsq_s16(v0);
114
-    v1 = vabsq_s16(v1);
115
-    v0 = vmaxq_u16(v0, v1);
116
+    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
117
+    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
118
+    uint16x8_t max = vmaxq_u16(abs0, abs1);
119
 
120
-    return vaddlvq_s16(v0);
121
+    return vaddlvq_u16(max);
122
 }
123
 
124
 static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3, int16x8_t &v20,
125
@@ -131,44 +132,47 @@
126
 
127
     HADAMARD4_V(v20, v21, v22, v23, v0,  v1, v2, v3);
128
 
129
-    transpose_8h(v0,  v1,  v16, v17);
130
-    transpose_8h(v2,  v3,  v18, v19);
131
-    transpose_8h(v4,  v5,  v20, v21);
132
-    transpose_8h(v6,  v7,  v22, v23);
133
+    transpose_8h_8h(v0,  v1,  v16, v17);
134
+    transpose_8h_8h(v2,  v3,  v18, v19);
135
+    transpose_8h_8h(v4,  v5,  v20, v21);
136
+    transpose_8h_8h(v6,  v7,  v22, v23);
137
 
138
     SUMSUB_AB(v16, v17, v0,  v1);
139
     SUMSUB_AB(v18, v19, v2,  v3);
140
     SUMSUB_AB(v20, v21, v4,  v5);
141
     SUMSUB_AB(v22, v23, v6,  v7);
142
 
143
-    transpose_4s(v0,  v2,  v16, v18);
144
-    transpose_4s(v1,  v3,  v17, v19);
145
-    transpose_4s(v4,  v6,  v20, v22);
146
-    transpose_4s(v5,  v7,  v21, v23);
147
-
148
-    v0 = vabsq_s16(v0);
149
-    v1 = vabsq_s16(v1);
150
-    v2 = vabsq_s16(v2);
151
-    v3 = vabsq_s16(v3);
152
-    v4 = vabsq_s16(v4);
153
-    v5 = vabsq_s16(v5);
154
-    v6 = vabsq_s16(v6);
155
-    v7 = vabsq_s16(v7);
156
-
157
-    v0 = vmaxq_u16(v0, v2);
158
-    v1 = vmaxq_u16(v1, v3);
159
-    v2 = vmaxq_u16(v4, v6);
160
-    v3 = vmaxq_u16(v5, v7);
161
-
162
+    transpose_4s_8h(v0,  v2,  v16, v18);
163
+    transpose_4s_8h(v1,  v3,  v17, v19);
164
+    transpose_4s_8h(v4,  v6,  v20, v22);
165
+    transpose_4s_8h(v5,  v7,  v21, v23);
166
+
167
+    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
168
+    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
169
+    uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
170
+    uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
171
+    uint16x8_t abs4 = vreinterpretq_u16_s16(vabsq_s16(v4));
172
+    uint16x8_t abs5 = vreinterpretq_u16_s16(vabsq_s16(v5));
173
+    uint16x8_t abs6 = vreinterpretq_u16_s16(vabsq_s16(v6));
174
+    uint16x8_t abs7 = vreinterpretq_u16_s16(vabsq_s16(v7));
175
+
176
+    v0 = vreinterpretq_s16_u16(vmaxq_u16(abs0, abs2));
177
+    v1 = vreinterpretq_s16_u16(vmaxq_u16(abs1, abs3));
178
+    v2 = vreinterpretq_s16_u16(vmaxq_u16(abs4, abs6));
179
+    v3 = vreinterpretq_s16_u16(vmaxq_u16(abs5, abs7));
180
 }
181
 
182
 #if HIGH_BIT_DEPTH
183
 
184
 #if (X265_DEPTH > 10)
185
-static inline void transpose_2d(int32x4_t &t1, int32x4_t &t2, const int32x4_t s1, const int32x4_t s2)
186
+static inline void transpose_2d_4s(int32x4_t &t1, int32x4_t &t2,
187
+                                   const int32x4_t s1, const int32x4_t s2)
188
 {
189
-    t1 = vtrn1q_s64(s1, s2);
190
-    t2 = vtrn2q_s64(s1, s2);
191
+    int64x2_t tmp1 = vreinterpretq_s64_s32(s1);
192
+    int64x2_t tmp2 = vreinterpretq_s64_s32(s2);
193
+
194
+    t1 = vreinterpretq_s32_s64(vtrn1q_s64(tmp1, tmp2));
195
+    t2 = vreinterpretq_s32_s64(vtrn2q_s64(tmp1, tmp2));
196
 }
197
 
198
 static inline void ISUMSUB_AB(int32x4_t &sum, int32x4_t &sub, const int32x4_t a, const int32x4_t b)
199
@@ -197,35 +201,35 @@
200
     int16x8_t v16, v17;
201
x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve.S -> x265_4.0.tar.gz/source/common/aarch64/pixel-util-sve.S Changed
107
 
1
@@ -190,27 +190,27 @@
2
     ld1b            {z7.h}, p0/z, x2, x11
3
     add             x0, x0, x1
4
     add             x2, x2, x3
5
-    ld1b            {z29.h}, p0/z, x0
6
-    ld1b            {z9.h}, p0/z, x0, x11
7
-    ld1b            {z10.h}, p0/z, x2
8
-    ld1b            {z11.h}, p0/z, x2, x11
9
-    add             x0, x0, x1
10
-    add             x2, x2, x3
11
-    ld1b            {z12.h}, p0/z, x0
12
-    ld1b            {z13.h}, p0/z, x0, x11
13
-    ld1b            {z14.h}, p0/z, x2
14
-    ld1b            {z15.h}, p0/z, x2, x11
15
-    add             x0, x0, x1
16
-    add             x2, x2, x3
17
-
18
     sub             \v0\().h, z0.h, z2.h
19
     sub             \v4\().h, z1.h, z3.h
20
     sub             \v1\().h, z4.h, z6.h
21
     sub             \v5\().h, z5.h, z7.h
22
-    sub             \v2\().h, z29.h, z10.h
23
-    sub             \v6\().h, z9.h, z11.h
24
-    sub             \v3\().h, z12.h, z14.h
25
-    sub             \v7\().h, z13.h, z15.h
26
+
27
+    ld1b            {z0.h}, p0/z, x0
28
+    ld1b            {z1.h}, p0/z, x0, x11
29
+    ld1b            {z2.h}, p0/z, x2
30
+    ld1b            {z3.h}, p0/z, x2, x11
31
+    add             x0, x0, x1
32
+    add             x2, x2, x3
33
+    ld1b            {z4.h}, p0/z, x0
34
+    ld1b            {z5.h}, p0/z, x0, x11
35
+    ld1b            {z6.h}, p0/z, x2
36
+    ld1b            {z7.h}, p0/z, x2, x11
37
+    add             x0, x0, x1
38
+    add             x2, x2, x3
39
+    sub             \v2\().h, z0.h, z2.h
40
+    sub             \v6\().h, z1.h, z3.h
41
+    sub             \v3\().h, z4.h, z6.h
42
+    sub             \v7\().h, z5.h, z7.h
43
 .endm
44
 
45
 // one vertical hadamard pass and two horizontal
46
@@ -314,60 +314,3 @@
47
     mov             x0, x7
48
     ret             x10
49
 endfunc
50
-
51
-/********* ssim ***********/
52
-// uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
53
-// No need to fully use sve instructions for this function
54
-function PFX(quant_sve)
55
-    mov             w9, #1
56
-    lsl             w9, w9, w4
57
-    mov             z0.s, w9
58
-    neg             w9, w4
59
-    mov             z1.s, w9
60
-    add             w9, w9, #8
61
-    mov             z2.s, w9
62
-    mov             z3.s, w5
63
-
64
-    lsr             w6, w6, #2
65
-    eor             z4.d, z4.d, z4.d
66
-    eor             w10, w10, w10
67
-    eor             z17.d, z17.d, z17.d
68
-
69
-.loop_quant_sve:
70
-    ld1             {v18.4h}, x0, #8
71
-    ld1             {v7.4s}, x1, #16
72
-    sxtl            v6.4s, v18.4h
73
-
74
-    cmlt            v5.4s, v6.4s, #0
75
-
76
-    abs             v6.4s, v6.4s
77
-
78
-
79
-    mul             v6.4s, v6.4s, v7.4s
80
-
81
-    add             v7.4s, v6.4s, v3.4s
82
-    sshl            v7.4s, v7.4s, v1.4s
83
-
84
-    mls             v6.4s, v7.4s, v0.s0
85
-    sshl            v16.4s, v6.4s, v2.4s
86
-    st1             {v16.4s}, x2, #16
87
-
88
-    // numsig
89
-    cmeq            v16.4s, v7.4s, v17.4s
90
-    add             v4.4s, v4.4s, v16.4s
91
-    add             w10, w10, #4
92
-
93
-    // level *= sign
94
-    eor             z16.d, z7.d, z5.d
95
-    sub             v16.4s, v16.4s, v5.4s
96
-    sqxtn           v5.4h, v16.4s
97
-    st1             {v5.4h}, x3, #8
98
-
99
-    subs            w6, w6, #1
100
-    b.ne             .loop_quant_sve
101
-
102
-    addv            s4, v4.4s
103
-    mov             w9, v4.s0
104
-    add             w0, w10, w9
105
-    ret
106
-endfunc
107
x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve2.S -> x265_4.0.tar.gz/source/common/aarch64/pixel-util-sve2.S Changed
201
 
1
@@ -64,11 +64,11 @@
2
     bgt             .vl_gt_16_pixel_var_16x16
3
     pixel_var_start
4
     mov             w12, #16
5
-.loop_var_16_sve2:
6
+.Loop_var_16_sve2:
7
     sub             w12, w12, #1
8
     ld1             {v4.16b}, x0, x1
9
     pixel_var_1 v4
10
-    cbnz            w12, .loop_var_16_sve2
11
+    cbnz            w12, .Loop_var_16_sve2
12
     pixel_var_end
13
     ret
14
 .vl_gt_16_pixel_var_16x16:
15
@@ -95,12 +95,12 @@
16
     bgt             .vl_gt_16_pixel_var_32x32
17
     pixel_var_start
18
     mov             w12, #32
19
-.loop_var_32_sve2:
20
+.Loop_var_32_sve2:
21
     sub             w12, w12, #1
22
     ld1             {v4.16b-v5.16b}, x0, x1
23
     pixel_var_1 v4
24
     pixel_var_1 v5
25
-    cbnz            w12, .loop_var_32_sve2
26
+    cbnz            w12, .Loop_var_32_sve2
27
     pixel_var_end
28
     ret
29
 .vl_gt_16_pixel_var_32x32:
30
@@ -150,14 +150,14 @@
31
     bgt             .vl_gt_16_pixel_var_64x64
32
     pixel_var_start
33
     mov             w12, #64
34
-.loop_var_64_sve2:
35
+.Loop_var_64_sve2:
36
     sub             w12, w12, #1
37
     ld1             {v4.16b-v7.16b}, x0, x1
38
     pixel_var_1 v4
39
     pixel_var_1 v5
40
     pixel_var_1 v6
41
     pixel_var_1 v7
42
-    cbnz            w12, .loop_var_64_sve2
43
+    cbnz            w12, .Loop_var_64_sve2
44
     pixel_var_end
45
     ret
46
 .vl_gt_16_pixel_var_64x64:
47
@@ -268,7 +268,7 @@
48
     bgt             .vl_gt_16_getResidual32
49
     lsl             x4, x3, #1
50
     mov             w12, #4
51
-.loop_residual_32:
52
+.Loop_residual_32:
53
     sub             w12, w12, #1
54
 .rept 4
55
     ld1             {v0.16b-v1.16b}, x0, x3
56
@@ -286,7 +286,7 @@
57
     st1             {v16.8h-v19.8h}, x2, x4
58
     st1             {v20.8h-v23.8h}, x2, x4
59
 .endr
60
-    cbnz            w12, .loop_residual_32
61
+    cbnz            w12, .Loop_residual_32
62
     ret
63
 .vl_gt_16_getResidual32:
64
     cmp             x9, #48
65
@@ -323,7 +323,7 @@
66
     bgt             .vl_gt_16_pixel_sub_ps_32x32
67
     lsl             x1, x1, #1
68
     mov             w12, #4
69
-.loop_sub_ps_32_sve2:
70
+.Loop_sub_ps_32_sve2:
71
     sub             w12, w12, #1
72
 .rept 4
73
     ld1             {v0.16b-v1.16b}, x2, x4
74
@@ -341,7 +341,7 @@
75
     st1             {v16.8h-v19.8h}, x0, x1
76
     st1             {v20.8h-v23.8h}, x0, x1
77
 .endr
78
-    cbnz            w12, .loop_sub_ps_32_sve2
79
+    cbnz            w12, .Loop_sub_ps_32_sve2
80
     ret
81
 .vl_gt_16_pixel_sub_ps_32x32:
82
     cmp             x9, #48
83
@@ -387,7 +387,7 @@
84
     lsl             x1, x1, #1
85
     sub             x1, x1, #64
86
     mov             w12, #16
87
-.loop_sub_ps_64_sve2:
88
+.Loop_sub_ps_64_sve2:
89
     sub             w12, w12, #1
90
 .rept 4
91
     ld1             {v0.16b-v3.16b}, x2, x4
92
@@ -403,7 +403,7 @@
93
     st1             {v16.8h-v19.8h}, x0, #64
94
     st1             {v20.8h-v23.8h}, x0, x1
95
 .endr
96
-    cbnz            w12, .loop_sub_ps_64_sve2
97
+    cbnz            w12, .Loop_sub_ps_64_sve2
98
     ret
99
 .vl_gt_16_pixel_sub_ps_64x64:
100
     rdvl            x9, #1
101
@@ -473,7 +473,7 @@
102
     bgt             .vl_gt_16_pixel_sub_ps_32x64
103
     lsl             x1, x1, #1
104
     mov             w12, #8
105
-.loop_sub_ps_32x64_sve2:
106
+.Loop_sub_ps_32x64_sve2:
107
     sub             w12, w12, #1
108
 .rept 4
109
     ld1             {v0.16b-v1.16b}, x2, x4
110
@@ -491,7 +491,7 @@
111
     st1             {v16.8h-v19.8h}, x0, x1
112
     st1             {v20.8h-v23.8h}, x0, x1
113
 .endr
114
-    cbnz            w12, .loop_sub_ps_32x64_sve2
115
+    cbnz            w12, .Loop_sub_ps_32x64_sve2
116
     ret
117
 .vl_gt_16_pixel_sub_ps_32x64:
118
     cmp             x9, #48
119
@@ -609,7 +609,7 @@
120
     bgt             .vl_gt_16_pixel_add_ps_32x\h
121
     lsl             x5, x5, #1
122
     mov             w12, #\h / 4
123
-.loop_add_ps__sve2_32x\h\():
124
+.Loop_add_ps__sve2_32x\h\():
125
     sub             w12, w12, #1
126
 .rept 4
127
     ld1             {v0.16b-v1.16b}, x2, x4
128
@@ -628,7 +628,7 @@
129
     sqxtun2         v5.16b, v27.8h
130
     st1             {v4.16b-v5.16b}, x0, x1
131
 .endr
132
-    cbnz            w12, .loop_add_ps__sve2_32x\h
133
+    cbnz            w12, .Loop_add_ps__sve2_32x\h
134
     ret
135
 .vl_gt_16_pixel_add_ps_32x\h\():
136
     cmp             x9, #48
137
@@ -1157,7 +1157,7 @@
138
     bgt             .vl_gt_16_ssimDist16
139
     ssimDist_start
140
     ptrue           p0.s, vl4
141
-.loop_ssimDist16_sve2:
142
+.Loop_ssimDist16_sve2:
143
     sub             w12, w12, #1
144
     ld1b            {z4.s}, p0/z, x0
145
     ld1b            {z5.s}, p0/z, x0, #1, mul vl
146
@@ -1171,7 +1171,7 @@
147
     add             x2, x2, x3
148
     ssimDist_1_sve2 z4, z5, z8, z9
149
     ssimDist_1_sve2 z6, z7, z10, z11
150
-    cbnz            w12, .loop_ssimDist16_sve2
151
+    cbnz            w12, .Loop_ssimDist16_sve2
152
     ssimDist_end
153
     ret
154
 .vl_gt_16_ssimDist16:
155
@@ -1217,7 +1217,7 @@
156
     bgt             .vl_gt_16_ssimDist32
157
     ssimDist_start
158
     ptrue           p0.s, vl4
159
-.loop_ssimDist32_sve2:
160
+.Loop_ssimDist32_sve2:
161
     sub             w12, w12, #1
162
     ld1b            {z2.s}, p0/z, x0
163
     ld1b            {z3.s}, p0/z, x0, #1, mul vl
164
@@ -1241,7 +1241,7 @@
165
     ssimDist_1_sve2 z4, z5, z12, z13
166
     ssimDist_1_sve2 z6, z7, z14, z15
167
     ssimDist_1_sve2 z8, z9, z30, z31
168
-    cbnz            w12, .loop_ssimDist32_sve2
169
+    cbnz            w12, .Loop_ssimDist32_sve2
170
     ssimDist_end
171
     ret
172
 .vl_gt_16_ssimDist32:
173
@@ -1309,7 +1309,7 @@
174
     bgt             .vl_gt_16_ssimDist64
175
     ssimDist_start
176
     ptrue           p0.s, vl4
177
-.loop_ssimDist64_sve2:
178
+.Loop_ssimDist64_sve2:
179
     sub             w12, w12, #1
180
     ld1b            {z2.s}, p0/z, x0
181
     ld1b            {z3.s}, p0/z, x0, #1, mul vl
182
@@ -1357,7 +1357,7 @@
183
     ssimDist_1_sve2 z8, z9, z29, z30
184
     add             x0, x0, x1
185
     add             x2, x2, x3
186
-    cbnz            w12, .loop_ssimDist64_sve2
187
+    cbnz            w12, .Loop_ssimDist64_sve2
188
     ssimDist_end
189
     ret
190
 .vl_gt_16_ssimDist64:
191
@@ -1482,7 +1482,7 @@
192
     bgt             .vl_gt_16_normFact16
193
     normFact_start
194
     ptrue           p0.s, vl4
195
-.loop_normFact16_sve2:
196
+.Loop_normFact16_sve2:
197
     sub             w12, w12, #1
198
     ld1b            {z4.s}, p0/z, x0
199
     ld1b            {z5.s}, p0/z, x0, #1, mul vl
200
@@ -1491,7 +1491,7 @@
201
x265_3.6.tar.gz/source/common/aarch64/pixel-util.S -> x265_4.0.tar.gz/source/common/aarch64/pixel-util.S Changed
201
 
1
@@ -60,11 +60,11 @@
2
 function PFX(pixel_var_16x16_neon)
3
     pixel_var_start
4
     mov             w12, #16
5
-.loop_var_16:
6
+.Loop_var_16:
7
     sub             w12, w12, #1
8
     ld1             {v4.16b}, x0, x1
9
     pixel_var_1 v4
10
-    cbnz            w12, .loop_var_16
11
+    cbnz            w12, .Loop_var_16
12
     pixel_var_end
13
     ret
14
 endfunc
15
@@ -72,12 +72,12 @@
16
 function PFX(pixel_var_32x32_neon)
17
     pixel_var_start
18
     mov             w12, #32
19
-.loop_var_32:
20
+.Loop_var_32:
21
     sub             w12, w12, #1
22
     ld1             {v4.16b-v5.16b}, x0, x1
23
     pixel_var_1 v4
24
     pixel_var_1 v5
25
-    cbnz            w12, .loop_var_32
26
+    cbnz            w12, .Loop_var_32
27
     pixel_var_end
28
     ret
29
 endfunc
30
@@ -85,14 +85,14 @@
31
 function PFX(pixel_var_64x64_neon)
32
     pixel_var_start
33
     mov             w12, #64
34
-.loop_var_64:
35
+.Loop_var_64:
36
     sub             w12, w12, #1
37
     ld1             {v4.16b-v7.16b}, x0, x1
38
     pixel_var_1 v4
39
     pixel_var_1 v5
40
     pixel_var_1 v6
41
     pixel_var_1 v7
42
-    cbnz            w12, .loop_var_64
43
+    cbnz            w12, .Loop_var_64
44
     pixel_var_end
45
     ret
46
 endfunc
47
@@ -148,7 +148,7 @@
48
 function PFX(getResidual32_neon)
49
     lsl             x4, x3, #1
50
     mov             w12, #4
51
-.loop_residual_32:
52
+.Loop_residual_32:
53
     sub             w12, w12, #1
54
 .rept 4
55
     ld1             {v0.16b-v1.16b}, x0, x3
56
@@ -166,7 +166,7 @@
57
     st1             {v16.8h-v19.8h}, x2, x4
58
     st1             {v20.8h-v23.8h}, x2, x4
59
 .endr
60
-    cbnz            w12, .loop_residual_32
61
+    cbnz            w12, .Loop_residual_32
62
     ret
63
 endfunc
64
 
65
@@ -221,7 +221,7 @@
66
 function PFX(pixel_sub_ps_32x32_neon)
67
     lsl             x1, x1, #1
68
     mov             w12, #4
69
-.loop_sub_ps_32:
70
+.Loop_sub_ps_32:
71
     sub             w12, w12, #1
72
 .rept 4
73
     ld1             {v0.16b-v1.16b}, x2, x4
74
@@ -239,7 +239,7 @@
75
     st1             {v16.8h-v19.8h}, x0, x1
76
     st1             {v20.8h-v23.8h}, x0, x1
77
 .endr
78
-    cbnz            w12, .loop_sub_ps_32
79
+    cbnz            w12, .Loop_sub_ps_32
80
     ret
81
 endfunc
82
 
83
@@ -247,7 +247,7 @@
84
     lsl             x1, x1, #1
85
     sub             x1, x1, #64
86
     mov             w12, #16
87
-.loop_sub_ps_64:
88
+.Loop_sub_ps_64:
89
     sub             w12, w12, #1
90
 .rept 4
91
     ld1             {v0.16b-v3.16b}, x2, x4
92
@@ -263,7 +263,7 @@
93
     st1             {v16.8h-v19.8h}, x0, #64
94
     st1             {v20.8h-v23.8h}, x0, x1
95
 .endr
96
-    cbnz            w12, .loop_sub_ps_64
97
+    cbnz            w12, .Loop_sub_ps_64
98
     ret
99
 endfunc
100
 
101
@@ -318,7 +318,7 @@
102
 function PFX(pixel_sub_ps_32x64_neon)
103
     lsl             x1, x1, #1
104
     mov             w12, #8
105
-.loop_sub_ps_32x64:
106
+.Loop_sub_ps_32x64:
107
     sub             w12, w12, #1
108
 .rept 4
109
     ld1             {v0.16b-v1.16b}, x2, x4
110
@@ -336,7 +336,7 @@
111
     st1             {v16.8h-v19.8h}, x0, x1
112
     st1             {v20.8h-v23.8h}, x0, x1
113
 .endr
114
-    cbnz            w12, .loop_sub_ps_32x64
115
+    cbnz            w12, .Loop_sub_ps_32x64
116
     ret
117
 endfunc
118
 
119
@@ -383,7 +383,7 @@
120
 function PFX(pixel_add_ps_16x\h\()_neon)
121
     lsl             x5, x5, #1
122
     mov             w12, #\h / 8
123
-.loop_add_ps_16x\h\():
124
+.Loop_add_ps_16x\h\():
125
     sub             w12, w12, #1
126
 .rept 4
127
     ld1             {v0.16b}, x2, x4
128
@@ -405,7 +405,7 @@
129
     st1             {v4.16b}, x0, x1
130
     st1             {v5.16b}, x0, x1
131
 .endr
132
-    cbnz            w12, .loop_add_ps_16x\h
133
+    cbnz            w12, .Loop_add_ps_16x\h
134
     ret
135
 endfunc
136
 .endm
137
@@ -417,7 +417,7 @@
138
  function PFX(pixel_add_ps_32x\h\()_neon)
139
     lsl             x5, x5, #1
140
     mov             w12, #\h / 4
141
-.loop_add_ps_32x\h\():
142
+.Loop_add_ps_32x\h\():
143
     sub             w12, w12, #1
144
 .rept 4
145
     ld1             {v0.16b-v1.16b}, x2, x4
146
@@ -436,7 +436,7 @@
147
     sqxtun2         v5.16b, v27.8h
148
     st1             {v4.16b-v5.16b}, x0, x1
149
 .endr
150
-    cbnz            w12, .loop_add_ps_32x\h
151
+    cbnz            w12, .Loop_add_ps_32x\h
152
     ret
153
 endfunc
154
 .endm
155
@@ -448,7 +448,7 @@
156
     lsl             x5, x5, #1
157
     sub             x5, x5, #64
158
     mov             w12, #32
159
-.loop_add_ps_64x64:
160
+.Loop_add_ps_64x64:
161
     sub             w12, w12, #1
162
 .rept 2
163
     ld1             {v0.16b-v3.16b}, x2, x4
164
@@ -480,7 +480,7 @@
165
     sqxtun2         v3.16b, v7.8h
166
     st1             {v0.16b-v3.16b}, x0, x1
167
 .endr
168
-    cbnz            w12, .loop_add_ps_64x64
169
+    cbnz            w12, .Loop_add_ps_64x64
170
     ret
171
 endfunc
172
 
173
@@ -548,7 +548,7 @@
174
 // void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
175
 function PFX(scale2D_64to32_neon)
176
     mov             w12, #32
177
-.loop_scale2D:
178
+.Loop_scale2D:
179
     ld1             {v0.16b-v3.16b}, x1, x2
180
     sub             w12, w12, #1
181
     ld1             {v4.16b-v7.16b}, x1, x2
182
@@ -561,7 +561,7 @@
183
     uqrshrn         v1.8b, v2.8h, #2
184
     uqrshrn2        v1.16b, v3.8h, #2
185
     st1             {v0.16b-v1.16b}, x0, #32
186
-    cbnz            w12, .loop_scale2D
187
+    cbnz            w12, .Loop_scale2D
188
     ret
189
 endfunc
190
 
191
@@ -569,33 +569,33 @@
192
 function PFX(pixel_planecopy_cp_neon)
193
     dup             v2.16b, w6
194
     sub             x5, x5, #1
195
-.loop_h:
196
+.Loop_h:
197
     mov             x6, x0
198
     mov             x12, x2
199
     mov             x7, #0
200
-.loop_w:
201
x265_3.6.tar.gz/source/common/aarch64/sad-a.S -> x265_4.0.tar.gz/source/common/aarch64/sad-a.S Changed
201
 
1
@@ -1,8 +1,9 @@
2
 /*****************************************************************************
3
- * Copyright (C) 2020-2021 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2024 MulticoreWare, Inc
5
  *
6
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
7
  *          Sebastian Pop <spop@amazon.com>
8
+            Hari Limaye <hari.limaye@arm.com>
9
  *
10
  * This program is free software; you can redistribute it and/or modify
11
  * it under the terms of the GNU General Public License as published by
12
@@ -23,7 +24,6 @@
13
  *****************************************************************************/
14
 
15
 #include "asm.S"
16
-#include "sad-a-common.S"
17
 
18
 #ifdef __APPLE__
19
 .section __RODATA,__rodata
20
@@ -35,12 +35,234 @@
21
 
22
 .text
23
 
24
+.macro SAD_START_4 f
25
+    ldr             s0, x0
26
+    ldr             s1, x2
27
+    add             x0, x0, x1
28
+    add             x2, x2, x3
29
+    ld1             {v0.s}1, x0, x1
30
+    ld1             {v1.s}1, x2, x3
31
+    \f              v16.8h, v0.8b, v1.8b
32
+.endm
33
+
34
+.macro SAD_4 h
35
+.rept \h / 2 - 1
36
+    SAD_START_4 uabal
37
+.endr
38
+.endm
39
+
40
+.macro SAD_START_8 f
41
+    ld1             {v0.8b}, x0, x1
42
+    ld1             {v1.8b}, x2, x3
43
+    \f              v16.8h, v0.8b, v1.8b
44
+.endm
45
+
46
+.macro SAD_8 h
47
+.rept \h - 3
48
+    SAD_START_8 uabal
49
+.endr
50
+    ldr             d0, x0
51
+    ldr             d1, x2
52
+    uabal           v16.8h, v0.8b, v1.8b
53
+    ldr             d0, x0, x1
54
+    ldr             d1, x2, x3
55
+    uabal           v16.8h, v0.8b, v1.8b
56
+.endm
57
+
58
+.macro SAD_START_16
59
+    movi            v16.16b, #0
60
+    movi            v17.16b, #0
61
+.endm
62
+
63
+.macro SAD_16
64
+    ld1             {v0.16b}, x0, x1
65
+    ld1             {v1.16b}, x2, x3
66
+    ld1             {v2.16b}, x0, x1
67
+    ld1             {v3.16b}, x2, x3
68
+    uabd            v20.16b, v0.16b, v1.16b
69
+    uadalp          v16.8h, v20.16b
70
+    uabd            v21.16b, v2.16b, v3.16b
71
+    uadalp          v17.8h, v21.16b
72
+.endm
73
+
74
+.macro SAD_END_16
75
+    add             v16.8h, v16.8h, v17.8h
76
+    uaddlv          s0, v16.8h
77
+    fmov            x0, d0
78
+    ret
79
+.endm
80
+
81
+.macro SAD_START_32
82
+    movi            v16.16b, #0
83
+    movi            v17.16b, #0
84
+    movi            v18.16b, #0
85
+    movi            v19.16b, #0
86
+.endm
87
+
88
+.macro SAD_32
89
+    ld1             {v0.16b-v1.16b}, x0, x1
90
+    ld1             {v2.16b-v3.16b}, x2, x3
91
+    ld1             {v4.16b-v5.16b}, x0, x1
92
+    ld1             {v6.16b-v7.16b}, x2, x3
93
+    uabd            v20.16b, v0.16b, v2.16b
94
+    uadalp          v16.8h, v20.16b
95
+    uabd            v21.16b, v1.16b, v3.16b
96
+    uadalp          v17.8h, v21.16b
97
+    uabd            v22.16b, v4.16b, v6.16b
98
+    uadalp          v18.8h, v22.16b
99
+    uabd            v23.16b, v5.16b, v7.16b
100
+    uadalp          v19.8h, v23.16b
101
+.endm
102
+
103
+.macro SAD_END_32
104
+    add             v16.8h, v16.8h, v17.8h
105
+    add             v17.8h, v18.8h, v19.8h
106
+    add             v16.8h, v16.8h, v17.8h
107
+    uaddlv          s0, v16.8h
108
+    fmov            w0, s0
109
+    ret
110
+.endm
111
+
112
+.macro SAD_START_64
113
+    movi            v16.16b, #0
114
+    movi            v17.16b, #0
115
+    movi            v18.16b, #0
116
+    movi            v19.16b, #0
117
+.endm
118
+
119
+.macro SAD_64
120
+    ld1             {v0.16b-v3.16b}, x0, x1
121
+    ld1             {v4.16b-v7.16b}, x2, x3
122
+    ld1             {v24.16b-v27.16b}, x0, x1
123
+    ld1             {v28.16b-v31.16b}, x2, x3
124
+    uabd            v20.16b, v0.16b, v4.16b
125
+    uadalp          v16.8h, v20.16b
126
+    uabd            v21.16b, v1.16b, v5.16b
127
+    uadalp          v17.8h, v21.16b
128
+    uabd            v22.16b, v2.16b, v6.16b
129
+    uadalp          v18.8h, v22.16b
130
+    uabd            v23.16b, v3.16b, v7.16b
131
+    uadalp          v19.8h, v23.16b
132
+    uabd            v20.16b, v24.16b, v28.16b
133
+    uadalp          v16.8h, v20.16b
134
+    uabd            v21.16b, v25.16b, v29.16b
135
+    uadalp          v17.8h, v21.16b
136
+    uabd            v22.16b, v26.16b, v30.16b
137
+    uadalp          v18.8h, v22.16b
138
+    uabd            v23.16b, v27.16b, v31.16b
139
+    uadalp          v19.8h, v23.16b
140
+.endm
141
+
142
+.macro SAD_END_64
143
+    uaddlp          v16.4s, v16.8h
144
+    uadalp          v16.4s, v17.8h
145
+    uadalp          v16.4s, v18.8h
146
+    uadalp          v16.4s, v19.8h
147
+    uaddlv          d0, v16.4s
148
+    fmov            x0, d0
149
+    ret
150
+.endm
151
+
152
+.macro SAD_START_12
153
+    movrel          x12, sad12_mask
154
+    ld1             {v31.16b}, x12
155
+    movi            v16.16b, #0
156
+    movi            v17.16b, #0
157
+.endm
158
+
159
+.macro SAD_12
160
+    ld1             {v0.16b}, x0, x1
161
+    and             v0.16b, v0.16b, v31.16b
162
+    ld1             {v1.16b}, x2, x3
163
+    and             v1.16b, v1.16b, v31.16b
164
+    ld1             {v2.16b}, x0, x1
165
+    and             v2.16b, v2.16b, v31.16b
166
+    ld1             {v3.16b}, x2, x3
167
+    and             v3.16b, v3.16b, v31.16b
168
+    uabd            v20.16b, v0.16b, v1.16b
169
+    uadalp          v16.8h, v20.16b
170
+    uabd            v21.16b, v2.16b, v3.16b
171
+    uadalp          v17.8h, v21.16b
172
+.endm
173
+
174
+.macro SAD_END_12
175
+    add             v16.8h, v16.8h, v17.8h
176
+    uaddlv          s0, v16.8h
177
+    fmov            w0, s0
178
+    ret
179
+.endm
180
+
181
+.macro SAD_START_24
182
+    movi            v16.16b, #0
183
+    movi            v17.16b, #0
184
+    sub             x1, x1, #16
185
+    sub             x3, x3, #16
186
+.endm
187
+
188
+.macro SAD_24
189
+    ld1             {v0.16b}, x0, #16
190
+    ld1             {v1.8b}, x0, x1
191
+    ld1             {v2.16b}, x2, #16
192
+    ld1             {v3.8b}, x2, x3
193
+    ld1             {v4.16b}, x0, #16
194
+    ld1             {v5.8b}, x0, x1
195
+    ld1             {v6.16b}, x2, #16
196
+    ld1             {v7.8b}, x2, x3
197
+    uabd            v20.16b, v0.16b, v2.16b
198
+    uadalp          v16.8h, v20.16b
199
+    uabal           v17.8h, v1.8b, v3.8b
200
+    uabd            v20.16b, v4.16b, v6.16b
201
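
Note on the SAD kernels in sad-a.S above: the core pattern is uabd (per-byte absolute difference) followed by uadalp (pairwise widening accumulate) and a final horizontal add. A rough intrinsics equivalent for one 16-pixel-wide block, illustrative only (the function name and parameters are invented, and the row count is assumed small enough that the 16-bit lanes cannot overflow):

    #include <arm_neon.h>
    #include <cstdint>

    // 16xH SAD using the uabd + uadalp pattern from the assembly above.
    static inline uint32_t sad_16xH(const uint8_t *a, intptr_t strideA,
                                    const uint8_t *b, intptr_t strideB, int h)
    {
        uint16x8_t acc = vdupq_n_u16(0);
        for (int y = 0; y < h; y++)
        {
            uint8x16_t diff = vabdq_u8(vld1q_u8(a), vld1q_u8(b)); // uabd
            acc = vpadalq_u8(acc, diff);                          // uadalp
            a += strideA;
            b += strideB;
        }
        return vaddlvq_u16(acc);                                  // uaddlv
    }
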
x265_4.0.tar.gz/source/common/aarch64/sad-neon-dotprod.S Added
201
 
1
@@ -0,0 +1,330 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+
27
+.arch armv8.2-a+dotprod
28
+
29
+#ifdef __APPLE__
30
+.section __RODATA,__rodata
31
+#else
32
+.section .rodata
33
+#endif
34
+
35
+.align 4
36
+
37
+.text
38
+
39
+// Fully unrolled with single accumulator for smaller block heights.
40
+.macro SAD_NEON_DOTPROD_16_S h
41
+function PFX(pixel_sad_16x\h\()_neon_dotprod)
42
+    movi            v0.16b, #0
43
+    movi            v1.16b, #1
44
+.rept \h - 2
45
+    ldr             q2, x0
46
+    ldr             q3, x2
47
+    add             x0, x0, x1
48
+    add             x2, x2, x3
49
+    uabd            v4.16b, v2.16b, v3.16b
50
+    udot            v0.4s, v4.16b, v1.16b
51
+.endr
52
+    ldr             q2, x0
53
+    ldr             q3, x2
54
+    uabd            v4.16b, v2.16b, v3.16b
55
+    udot            v0.4s, v4.16b, v1.16b
56
+    ldr             q2, x0, x1
57
+    ldr             q3, x2, x3
58
+    uabd            v4.16b, v2.16b, v3.16b
59
+    udot            v0.4s, v4.16b, v1.16b
60
+
61
+    addv            s0, v0.4s
62
+    fmov            w0, s0
63
+    ret
64
+endfunc
65
+.endm
66
+
67
+.macro SAD_NEON_DOTPROD_START
68
+    // v31: 1 across all lanes for use in UDOT instructions.
69
+    movi            v31.16b, #1
70
+    movi            v16.16b, #0
71
+    movi            v17.16b, #0
72
+.endm
73
+
74
+.macro SAD_NEON_DOTPROD_END
75
+    add             v16.4s, v16.4s, v17.4s
76
+    addv            s0, v16.4s
77
+    fmov            w0, s0
78
+    ret
79
+.endm
80
+
81
+// Fully unrolled.
82
+.macro SAD_NEON_DOTPROD_16 h
83
+function PFX(pixel_sad_16x\h\()_neon_dotprod)
84
+    SAD_NEON_DOTPROD_START
85
+.rept \h / 2
86
+    ld1             {v0.16b}, x0, x1
87
+    ld1             {v1.16b}, x0, x1
88
+    ld1             {v2.16b}, x2, x3
89
+    ld1             {v3.16b}, x2, x3
90
+    uabd            v20.16b, v0.16b, v2.16b
91
+    udot            v16.4s, v20.16b, v31.16b
92
+    uabd            v21.16b, v1.16b, v3.16b
93
+    udot            v17.4s, v21.16b, v31.16b
94
+.endr
95
+    SAD_NEON_DOTPROD_END
96
+endfunc
97
+.endm
98
+
99
+// Process four rows of width 32.
100
+.macro SAD_NEON_DOTPROD_32
101
+.rept 4
102
+    ld1             {v0.16b-v1.16b}, x0, x1
103
+    ld1             {v2.16b-v3.16b}, x2, x3
104
+    uabd            v20.16b, v0.16b, v2.16b
105
+    udot            v16.4s, v20.16b, v31.16b
106
+    uabd            v21.16b, v1.16b, v3.16b
107
+    udot            v17.4s, v21.16b, v31.16b
108
+.endr
109
+.endm
110
+
111
+// Process four rows of width 48.
112
+.macro SAD_NEON_DOTPROD_48
113
+.rept 4
114
+    ld1             {v0.16b-v2.16b}, x0, x1
115
+    ld1             {v4.16b-v6.16b}, x2, x3
116
+    uabd            v20.16b, v0.16b, v4.16b
117
+    udot            v16.4s, v20.16b, v31.16b
118
+    uabd            v21.16b, v1.16b, v5.16b
119
+    udot            v17.4s, v21.16b, v31.16b
120
+    uabd            v20.16b, v2.16b, v6.16b
121
+    udot            v16.4s, v20.16b, v31.16b
122
+.endr
123
+.endm
124
+
125
+// Process four rows of width 64.
126
+.macro SAD_NEON_DOTPROD_64
127
+.rept 4
128
+    ld1             {v0.16b-v3.16b}, x0, x1
129
+    ld1             {v4.16b-v7.16b}, x2, x3
130
+    uabd            v20.16b, v0.16b, v4.16b
131
+    udot            v16.4s, v20.16b, v31.16b
132
+    uabd            v21.16b, v1.16b, v5.16b
133
+    udot            v17.4s, v21.16b, v31.16b
134
+    uabd            v20.16b, v2.16b, v6.16b
135
+    udot            v16.4s, v20.16b, v31.16b
136
+    uabd            v21.16b, v3.16b, v7.16b
137
+    udot            v17.4s, v21.16b, v31.16b
138
+.endr
139
+.endm
140
+
141
+// Loop unrolled to process 4 rows per iteration.
142
+.macro SAD_NEON_DOTPROD_LOOP w, h
143
+function PFX(pixel_sad_\w\()x\h\()_neon_dotprod)
144
+    SAD_NEON_DOTPROD_START
145
+    mov             w9, #\h/4
146
+.Loop_\w\()x\h:
147
+    sub             w9, w9, #1
148
+
149
+    SAD_NEON_DOTPROD_\w
150
+
151
+    cbnz            w9, .Loop_\w\()x\h
152
+    SAD_NEON_DOTPROD_END
153
+endfunc
154
+.endm
155
+
156
+SAD_NEON_DOTPROD_16_S 4
157
+SAD_NEON_DOTPROD_16_S 8
158
+SAD_NEON_DOTPROD_16_S 12
159
+SAD_NEON_DOTPROD_16_S 16
160
+SAD_NEON_DOTPROD_16 32
161
+SAD_NEON_DOTPROD_16 64
162
+SAD_NEON_DOTPROD_LOOP  32, 8
163
+SAD_NEON_DOTPROD_LOOP  32, 16
164
+SAD_NEON_DOTPROD_LOOP  32, 24
165
+SAD_NEON_DOTPROD_LOOP  32, 32
166
+SAD_NEON_DOTPROD_LOOP  32, 64
167
+SAD_NEON_DOTPROD_LOOP  48, 64
168
+SAD_NEON_DOTPROD_LOOP  64, 16
169
+SAD_NEON_DOTPROD_LOOP  64, 32
170
+SAD_NEON_DOTPROD_LOOP  64, 48
171
+SAD_NEON_DOTPROD_LOOP  64, 64
172
+
173
+.macro PREP_ARGS_SAD_X_NEON_DOTPROD x
174
+    mov             x9, #FENC_STRIDE
175
+
176
+// Make function arguments for x == 3 look like x == 4.
177
+.if \x == 3
178
+    mov             x6, x5
179
+    mov             x5, x4
180
+.endif
181
+
182
+    // v31: 1 across all lanes for use in UDOT instructions.
183
+    movi            v31.16b, #1
184
+.endm
185
+
186
+.macro SAD_X_NEON_DOTPROD_START x
187
+    movi v16.4s, #0
188
+    movi v17.4s, #0
189
+    movi v18.4s, #0
190
+.if \x == 4
191
+    movi v19.4s, #0
192
+.endif
193
+.endm
194
+
195
+.macro SAD_X_NEON_DOTPROD_END x
196
+.if \x == 3
197
+    addv            s0, v16.4s
198
+    addv            s1, v17.4s
199
+    addv            s2, v18.4s
200
+    stp             s0, s1, x6
201
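
Note on the new sad-neon-dotprod.S above: on cores with the Armv8.4 DotProd extension, UDOT against a vector of ones sums groups of four absolute differences directly into 32-bit lanes, removing the intermediate 16-bit accumulators of the plain Neon path. A rough intrinsics sketch, illustrative only (invented names; requires a DotProd-enabled target such as -march=armv8.2-a+dotprod):

    #include <arm_neon.h>
    #include <cstdint>

    // 16xH SAD using uabd + udot, mirroring the kernels in the file above.
    static inline uint32_t sad_16xH_dotprod(const uint8_t *a, intptr_t strideA,
                                            const uint8_t *b, intptr_t strideB, int h)
    {
        const uint8x16_t ones = vdupq_n_u8(1);   // v31 in the assembly above
        uint32x4_t acc = vdupq_n_u32(0);
        for (int y = 0; y < h; y++)
        {
            uint8x16_t diff = vabdq_u8(vld1q_u8(a), vld1q_u8(b));
            acc = vdotq_u32(acc, diff, ones);    // 4-way dot product per u32 lane
            a += strideA;
            b += strideB;
        }
        return vaddvq_u32(acc);                  // addv
    }
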
x265_4.0.tar.gz/source/common/aarch64/sao-prim-sve.cpp Added
201
 
1
@@ -0,0 +1,271 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "sao-prim.h"
26
+
27
+/*
28
+ * Compute Edge Offset statistics (count and stats).
29
+ * To save some instructions compute count and stats as negative values - since
30
+ * output of Neon comparison instructions for a matched condition is all 1s (-1).
31
+ */
32
+static inline void compute_eo_stats(const int8x16_t edge_type,
33
+                                    const int16_t *diff, int16x8_t *count,
34
+                                    int64x2_t *stats)
35
+{
36
+    // Create a mask for each edge type.
37
+    int8x16_t mask0 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-2)));
38
+    int8x16_t mask1 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-1)));
39
+    int8x16_t mask2 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(0)));
40
+    int8x16_t mask3 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(1)));
41
+    int8x16_t mask4 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(2)));
42
+
43
+    // Compute negative counts for each edge type.
44
+    count[0] = vpadalq_s8(count[0], mask0);
45
+    count[1] = vpadalq_s8(count[1], mask1);
46
+    count[2] = vpadalq_s8(count[2], mask2);
47
+    count[3] = vpadalq_s8(count[3], mask3);
48
+    count[4] = vpadalq_s8(count[4], mask4);
49
+
50
+    // Widen the masks to 16-bit.
51
+    int16x8_t mask0_lo = vreinterpretq_s16_s8(vzip1q_s8(mask0, mask0));
52
+    int16x8_t mask0_hi = vreinterpretq_s16_s8(vzip2q_s8(mask0, mask0));
53
+    int16x8_t mask1_lo = vreinterpretq_s16_s8(vzip1q_s8(mask1, mask1));
54
+    int16x8_t mask1_hi = vreinterpretq_s16_s8(vzip2q_s8(mask1, mask1));
55
+    int16x8_t mask2_lo = vreinterpretq_s16_s8(vzip1q_s8(mask2, mask2));
56
+    int16x8_t mask2_hi = vreinterpretq_s16_s8(vzip2q_s8(mask2, mask2));
57
+    int16x8_t mask3_lo = vreinterpretq_s16_s8(vzip1q_s8(mask3, mask3));
58
+    int16x8_t mask3_hi = vreinterpretq_s16_s8(vzip2q_s8(mask3, mask3));
59
+    int16x8_t mask4_lo = vreinterpretq_s16_s8(vzip1q_s8(mask4, mask4));
60
+    int16x8_t mask4_hi = vreinterpretq_s16_s8(vzip2q_s8(mask4, mask4));
61
+
62
+    int16x8_t diff_lo = vld1q_s16(diff);
63
+    int16x8_t diff_hi = vld1q_s16(diff + 8);
64
+
65
+    // Compute negative stats for each edge type.
66
+    stats[0] = x265_sdotq_s16(stats[0], diff_lo, mask0_lo);
67
+    stats[0] = x265_sdotq_s16(stats[0], diff_hi, mask0_hi);
68
+    stats[1] = x265_sdotq_s16(stats[1], diff_lo, mask1_lo);
69
+    stats[1] = x265_sdotq_s16(stats[1], diff_hi, mask1_hi);
70
+    stats[2] = x265_sdotq_s16(stats[2], diff_lo, mask2_lo);
71
+    stats[2] = x265_sdotq_s16(stats[2], diff_hi, mask2_hi);
72
+    stats[3] = x265_sdotq_s16(stats[3], diff_lo, mask3_lo);
73
+    stats[3] = x265_sdotq_s16(stats[3], diff_hi, mask3_hi);
74
+    stats[4] = x265_sdotq_s16(stats[4], diff_lo, mask4_lo);
75
+    stats[4] = x265_sdotq_s16(stats[4], diff_hi, mask4_hi);
76
+}
77
+
78
+/*
79
+ * Reduce and store Edge Offset statistics (count and stats).
80
+ */
81
+static inline void reduce_eo_stats(int64x2_t *vstats, int16x8_t *vcount,
82
+                                   int32_t *stats, int32_t *count)
83
+{
84
+    // s_eoTable maps edge types to memory in order: {2, 0, 1, 3, 4}.
85
+    int16x8_t c01 = vpaddq_s16(vcount[2], vcount[0]);
86
+    int16x8_t c23 = vpaddq_s16(vcount[1], vcount[3]);
87
+    int16x8_t c0123 = vpaddq_s16(c01, c23);
88
+    // Subtract from current count, as we calculate the negation.
89
+    vst1q_s32(count, vsubq_s32(vld1q_s32(count), vpaddlq_s16(c0123)));
90
+    count[4] -= vaddvq_s16(vcount[4]);
91
+
92
+    int32x4_t s01 = vcombine_s32(vmovn_s64(vstats[2]), vmovn_s64(vstats[0]));
93
+    int32x4_t s23 = vcombine_s32(vmovn_s64(vstats[1]), vmovn_s64(vstats[3]));
94
+    int32x4_t s0123 = vpaddq_s32(s01, s23);
95
+    // Subtract from current stats, as we calculate the negation.
96
+    vst1q_s32(stats, vsubq_s32(vld1q_s32(stats), s0123));
97
+    stats[4] -= vaddvq_s64(vstats[4]);
98
+}
99
+
100
+namespace X265_NS {
101
+void saoCuStatsE0_sve(const int16_t *diff, const pixel *rec, intptr_t stride,
102
+                      int endX, int endY, int32_t *stats, int32_t *count)
103
+{
104
+    // Separate buffers for each edge type, so that we can vectorise.
105
+    int16x8_t tmp_count[5] = { vdupq_n_s16(0), vdupq_n_s16(0), vdupq_n_s16(0),
106
+                               vdupq_n_s16(0), vdupq_n_s16(0) };
107
+    int64x2_t tmp_stats[5] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
108
+                               vdupq_n_s64(0), vdupq_n_s64(0) };
109
+
110
+    for (int y = 0; y < endY; y++)
111
+    {
112
+        // Calculate negated sign_left(x) directly, to save negation when
113
+        // reusing sign_right(x) as sign_left(x + 1).
114
+        int8x16_t neg_sign_left = vdupq_n_s8(x265_signOf(rec[-1] - rec[0]));
115
+        for (int x = 0; x < endX; x += 16)
116
+        {
117
+            int8x16_t sign_right = signOf_neon(rec + x, rec + x + 1);
118
+
119
+            // neg_sign_left(x) = sign_right(x + 1), reusing one from previous
120
+            // iteration.
121
+            neg_sign_left = vextq_s8(neg_sign_left, sign_right, 15);
122
+
123
+            // Subtract instead of add, as sign_left is negated.
124
+            int8x16_t edge_type = vsubq_s8(sign_right, neg_sign_left);
125
+
126
+            // For reuse in the next iteration.
127
+            neg_sign_left = sign_right;
128
+
129
+            edge_type = x265_sve_mask(x, endX, edge_type);
130
+            compute_eo_stats(edge_type, diff + x, tmp_count, tmp_stats);
131
+        }
132
+
133
+        diff += MAX_CU_SIZE;
134
+        rec += stride;
135
+    }
136
+
137
+    reduce_eo_stats(tmp_stats, tmp_count, stats, count);
138
+}
139
+
140
+void saoCuStatsE1_sve(const int16_t *diff, const pixel *rec, intptr_t stride,
141
+                      int8_t *upBuff1, int endX, int endY, int32_t *stats,
142
+                      int32_t *count)
143
+{
144
+    // Separate buffers for each edge type, so that we can vectorise.
145
+    int16x8_t tmp_count[5] = { vdupq_n_s16(0), vdupq_n_s16(0), vdupq_n_s16(0),
146
+                               vdupq_n_s16(0), vdupq_n_s16(0) };
147
+    int64x2_t tmp_stats[5] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
148
+                               vdupq_n_s64(0), vdupq_n_s64(0) };
149
+
150
+    // Negate upBuff1 (sign_up), so we can subtract and save repeated negations.
151
+    for (int x = 0; x < endX; x += 16)
152
+    {
153
+        vst1q_s8(upBuff1 + x, vnegq_s8(vld1q_s8(upBuff1 + x)));
154
+    }
155
+
156
+    for (int y = 0; y < endY; y++)
157
+    {
158
+        for (int x = 0; x < endX; x += 16)
159
+        {
160
+            int8x16_t sign_up = vld1q_s8(upBuff1 + x);
161
+            int8x16_t sign_down = signOf_neon(rec + x, rec + x + stride);
162
+
163
+            // Subtract instead of add, as sign_up is negated.
164
+            int8x16_t edge_type = vsubq_s8(sign_down, sign_up);
165
+
166
+            // For reuse in the next iteration.
167
+            vst1q_s8(upBuff1 + x, sign_down);
168
+
169
+            edge_type = x265_sve_mask(x, endX, edge_type);
170
+            compute_eo_stats(edge_type, diff + x, tmp_count, tmp_stats);
171
+        }
172
+
173
+        diff += MAX_CU_SIZE;
174
+        rec += stride;
175
+    }
176
+
177
+    reduce_eo_stats(tmp_stats, tmp_count, stats, count);
178
+}
179
+
180
+void saoCuStatsE2_sve(const int16_t *diff, const pixel *rec, intptr_t stride,
181
+                      int8_t *upBuff1, int8_t *upBufft, int endX, int endY,
182
+                      int32_t *stats, int32_t *count)
183
+{
184
+    // Separate buffers for each edge type, so that we can vectorise.
185
+    int16x8_t tmp_count[5] = { vdupq_n_s16(0), vdupq_n_s16(0), vdupq_n_s16(0),
186
+                               vdupq_n_s16(0), vdupq_n_s16(0) };
187
+    int64x2_t tmp_stats[5] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
188
+                               vdupq_n_s64(0), vdupq_n_s64(0) };
189
+
190
+    // Negate upBuff1 (sign_up) so we can subtract and save repeated negations.
191
+    for (int x = 0; x < endX; x += 16)
192
+    {
193
+        vst1q_s8(upBuff1 + x, vnegq_s8(vld1q_s8(upBuff1 + x)));
194
+    }
195
+
196
+    for (int y = 0; y < endY; y++)
197
+    {
198
+        upBufft[0] = x265_signOf(rec[-1] - rec[stride]);
199
+        for (int x = 0; x < endX; x += 16)
200
+        {
201
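
Note on compute_eo_stats in sao-prim-sve.cpp above: Neon compares return all-ones (-1) per matching lane, so the code accumulates the comparison masks directly and obtains negated counts, paying for the sign only once, as a subtraction from the destination, in the final reduction. A small self-contained illustration of the same idea, with invented names and not taken from x265:

    #include <arm_neon.h>
    #include <cstdint>

    // Counts how many bytes of edge_type equal 'wanted'; n is assumed to be a
    // multiple of 16. The loop adds -1 per match and negates once at the end.
    static inline int count_edge_type(const int8_t *edge_type, int n, int8_t wanted)
    {
        int16x8_t neg_count = vdupq_n_s16(0);
        for (int i = 0; i < n; i += 16)
        {
            int8x16_t v = vld1q_s8(edge_type + i);
            int8x16_t mask = vreinterpretq_s8_u8(vceqq_s8(v, vdupq_n_s8(wanted)));
            neg_count = vpadalq_s8(neg_count, mask);
        }
        return -vaddvq_s16(neg_count);
    }
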
x265_4.0.tar.gz/source/common/aarch64/sao-prim-sve2.cpp Added
201
 
1
@@ -0,0 +1,317 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "sao-prim.h"
26
+
27
+static inline uint8x16_t sve_count(int8x16_t in)
28
+{
29
+    // We do not care about initialising the values in the rest of the vector,
30
+    // for VL > 128, as HISTSEG counts matching elements in 128-bit segments.
31
+    svint8_t edge_type = svset_neonq_s8(svundef_s8(), in);
32
+
33
+    // Use an arbitrary value outside of range [-2, 2] for lanes we don't
34
+    // need to use the result from.
35
+    const int DC = -3;
36
+    // s_eoTable maps edge types to memory in order: {2, 0, 1, 3, 4}.
37
+    // We use (edge_class - 2) resulting in   {0, -2, -1, 1, 2}
38
+    int8x16_t idx = { 0, -2, -1, 1, 2, DC, DC, DC, DC, DC, DC, DC, DC, DC, DC,
39
+                      DC };
40
+    svint8_t svidx = svset_neonq_s8(svundef_s8(), idx);
41
+
42
+    svuint8_t count = svhistseg_s8(svidx, edge_type);
43
+    return svget_neonq_u8(count);
44
+}
45
+
46
+/*
47
+ * Compute Edge Offset statistics (stats array).
48
+ * To save some instructions compute stats as negative values - since output of
49
+ * Neon comparison instructions for a matched condition is all 1s (-1).
50
+ */
51
+static inline void compute_eo_stats(const int8x16_t edge_type,
52
+                                    const int16_t *diff, int64x2_t *stats)
53
+{
54
+    // Create a mask for each edge type.
55
+    int8x16_t mask0 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-2)));
56
+    int8x16_t mask1 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-1)));
57
+    int8x16_t mask2 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(0)));
58
+    int8x16_t mask3 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(1)));
59
+    int8x16_t mask4 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(2)));
60
+
61
+    // Widen the masks to 16-bit.
62
+    int16x8_t mask0_lo = vreinterpretq_s16_s8(vzip1q_s8(mask0, mask0));
63
+    int16x8_t mask0_hi = vreinterpretq_s16_s8(vzip2q_s8(mask0, mask0));
64
+    int16x8_t mask1_lo = vreinterpretq_s16_s8(vzip1q_s8(mask1, mask1));
65
+    int16x8_t mask1_hi = vreinterpretq_s16_s8(vzip2q_s8(mask1, mask1));
66
+    int16x8_t mask2_lo = vreinterpretq_s16_s8(vzip1q_s8(mask2, mask2));
67
+    int16x8_t mask2_hi = vreinterpretq_s16_s8(vzip2q_s8(mask2, mask2));
68
+    int16x8_t mask3_lo = vreinterpretq_s16_s8(vzip1q_s8(mask3, mask3));
69
+    int16x8_t mask3_hi = vreinterpretq_s16_s8(vzip2q_s8(mask3, mask3));
70
+    int16x8_t mask4_lo = vreinterpretq_s16_s8(vzip1q_s8(mask4, mask4));
71
+    int16x8_t mask4_hi = vreinterpretq_s16_s8(vzip2q_s8(mask4, mask4));
72
+
73
+    int16x8_t diff_lo = vld1q_s16(diff);
74
+    int16x8_t diff_hi = vld1q_s16(diff + 8);
75
+
76
+    // Compute negative stats for each edge type.
77
+    stats[0] = x265_sdotq_s16(stats[0], diff_lo, mask0_lo);
78
+    stats[0] = x265_sdotq_s16(stats[0], diff_hi, mask0_hi);
79
+    stats[1] = x265_sdotq_s16(stats[1], diff_lo, mask1_lo);
80
+    stats[1] = x265_sdotq_s16(stats[1], diff_hi, mask1_hi);
81
+    stats[2] = x265_sdotq_s16(stats[2], diff_lo, mask2_lo);
82
+    stats[2] = x265_sdotq_s16(stats[2], diff_hi, mask2_hi);
83
+    stats[3] = x265_sdotq_s16(stats[3], diff_lo, mask3_lo);
84
+    stats[3] = x265_sdotq_s16(stats[3], diff_hi, mask3_hi);
85
+    stats[4] = x265_sdotq_s16(stats[4], diff_lo, mask4_lo);
86
+    stats[4] = x265_sdotq_s16(stats[4], diff_hi, mask4_hi);
87
+}
88
+
89
+/*
90
+ * Reduce and store Edge Offset statistics (count and stats).
91
+ */
92
+static inline void reduce_eo_stats(int64x2_t *vstats, uint16x8_t vcount,
93
+                                   int32_t *stats, int32_t *count)
94
+{
95
+    // s_eoTable maps edge types to memory in order: {2, 0, 1, 3, 4}.
96
+    // We already have the count values in the correct order for the store,
97
+    // so widen to 32-bit and accumulate to the destination.
98
+    int32x4_t c0123 = vmovl_s16(vget_low_s16(vreinterpretq_s16_u16(vcount)));
99
+    vst1q_s32(count, vaddq_s32(vld1q_s32(count), c0123));
100
+    count[4] += vcount[4];
101
+
102
+    int32x4_t s01 = vcombine_s32(vmovn_s64(vstats[2]), vmovn_s64(vstats[0]));
103
+    int32x4_t s23 = vcombine_s32(vmovn_s64(vstats[1]), vmovn_s64(vstats[3]));
104
+    int32x4_t s0123 = vpaddq_s32(s01, s23);
105
+    // Subtract from current stats, as we calculate the negation.
106
+    vst1q_s32(stats, vsubq_s32(vld1q_s32(stats), s0123));
107
+    stats[4] -= vaddvq_s64(vstats[4]);
108
+}
109
+
110
+namespace X265_NS {
111
+void saoCuStatsE0_sve2(const int16_t *diff, const pixel *rec, intptr_t stride,
112
+                       int endX, int endY, int32_t *stats, int32_t *count)
113
+{
114
+    // Separate buffers for each edge type, so that we can vectorise.
115
+    int64x2_t tmp_stats[5] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
116
+                               vdupq_n_s64(0), vdupq_n_s64(0) };
117
+    uint16x8_t count_acc_u16 = vdupq_n_u16(0);
118
+
119
+    for (int y = 0; y < endY; y++)
120
+    {
121
+        uint8x16_t count_acc_u8 = vdupq_n_u8(0);
122
+
123
+        // Calculate negated sign_left(x) directly, to save negation when
124
+        // reusing sign_right(x) as sign_left(x + 1).
125
+        int8x16_t neg_sign_left = vdupq_n_s8(x265_signOf(rec[-1] - rec[0]));
126
+        for (int x = 0; x < endX; x += 16)
127
+        {
128
+            int8x16_t sign_right = signOf_neon(rec + x, rec + x + 1);
129
+
130
+            // neg_sign_left(x) = sign_right(x + 1), reusing one from previous
131
+            // iteration.
132
+            neg_sign_left = vextq_s8(neg_sign_left, sign_right, 15);
133
+
134
+            // Subtract instead of add, as sign_left is negated.
135
+            int8x16_t edge_type = vsubq_s8(sign_right, neg_sign_left);
136
+
137
+            // For reuse in the next iteration.
138
+            neg_sign_left = sign_right;
139
+
140
+            edge_type = x265_sve_mask(x, endX, edge_type);
141
+            count_acc_u8 = vaddq_u8(count_acc_u8, sve_count(edge_type));
142
+            compute_eo_stats(edge_type, diff + x, tmp_stats);
143
+        }
144
+
145
+        // The width (endX) can be a maximum of 64, so we can safely
146
+        // widen from 8-bit count accumulators after one inner loop iteration.
147
+        // Technically the largest an accumulator could reach after one inner
148
+        // loop iteration is 64, if every input value had the same edge type, so
149
+        // we could complete two iterations (2 * 64 = 128) before widening.
150
+        count_acc_u16 = vaddw_u8(count_acc_u16, vget_low_u8(count_acc_u8));
151
+
152
+        diff += MAX_CU_SIZE;
153
+        rec += stride;
154
+    }
155
+
156
+    reduce_eo_stats(tmp_stats, count_acc_u16, stats, count);
157
+}
158
+
159
+void saoCuStatsE1_sve2(const int16_t *diff, const pixel *rec, intptr_t stride,
160
+                       int8_t *upBuff1, int endX, int endY, int32_t *stats,
161
+                       int32_t *count)
162
+{
163
+    // Separate buffers for each edge type, so that we can vectorise.
164
+    int64x2_t tmp_stats[5] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
165
+                               vdupq_n_s64(0), vdupq_n_s64(0) };
166
+    uint16x8_t count_acc_u16 = vdupq_n_u16(0);
167
+
168
+    // Negate upBuff1 (sign_up), so we can subtract and save repeated negations.
169
+    for (int x = 0; x < endX; x += 16)
170
+    {
171
+        vst1q_s8(upBuff1 + x, vnegq_s8(vld1q_s8(upBuff1 + x)));
172
+    }
173
+
174
+    for (int y = 0; y < endY; y++)
175
+    {
176
+        uint8x16_t count_acc_u8 = vdupq_n_u8(0);
177
+
178
+        for (int x = 0; x < endX; x += 16)
179
+        {
180
+            int8x16_t sign_up = vld1q_s8(upBuff1 + x);
181
+            int8x16_t sign_down = signOf_neon(rec + x, rec + x + stride);
182
+
183
+            // Subtract instead of add, as sign_up is negated.
184
+            int8x16_t edge_type = vsubq_s8(sign_down, sign_up);
185
+
186
+            // For reuse in the next iteration.
187
+            vst1q_s8(upBuff1 + x, sign_down);
188
+
189
+            edge_type = x265_sve_mask(x, endX, edge_type);
190
+            count_acc_u8 = vaddq_u8(count_acc_u8, sve_count(edge_type));
191
+            compute_eo_stats(edge_type, diff + x, tmp_stats);
192
+        }
193
+
194
+        // The width (endX) can be a maximum of 64, so we can safely
195
+        // widen from 8-bit count accumulators after one inner loop iteration.
196
+        // Technically the largest an accumulator could reach after one inner
197
+        // loop iteration is 64, if every input value had the same edge type, so
198
+        // we could complete two iterations (2 * 64 = 128) before widening.
199
+        count_acc_u16 = vaddw_u8(count_acc_u16, vget_low_u8(count_acc_u8));
200
+
201
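
Note on sve_count in sao-prim-sve2.cpp above: the SVE2 HISTSEG instruction counts, for each byte of its first operand, how many bytes of the second operand within the same 128-bit segment are equal to it, so one instruction yields the per-class counts for a whole 16-byte edge_type vector. A sketch of that use via the Neon-SVE bridge, illustrative only (the helper name is invented; assumes an SVE2-capable core and an SVE2-enabled compile):

    #include <arm_neon.h>
    #include <arm_sve.h>
    #include <arm_neon_sve_bridge.h>

    // Lanes 0-4 of the result hold the counts of the five edge classes (stored
    // offset by -2, as in the file above); -3 cannot occur, so the remaining
    // lanes produce counts that the caller simply ignores.
    static inline uint8x16_t count_edge_classes(int8x16_t edge_type)
    {
        const int8x16_t classes = { 0, -2, -1, 1, 2, -3, -3, -3,
                                    -3, -3, -3, -3, -3, -3, -3, -3 };
        svint8_t needles  = svset_neonq_s8(svundef_s8(), classes);
        svint8_t haystack = svset_neonq_s8(svundef_s8(), edge_type);
        return svget_neonq_u8(svhistseg_s8(needles, haystack));
    }
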
x265_4.0.tar.gz/source/common/aarch64/sao-prim.cpp Added
201
 
1
@@ -0,0 +1,380 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "sao-prim.h"
26
+#include "sao.h"
27
+#include <arm_neon.h>
28
+
29
+// Predicate mask indices.
30
+static const int8_t quad_reg_byte_indices[16] = {
31
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
32
+};
33
+
34
+static inline int8x16_t mask_inactive_elems(const int rem, int8x16_t edge_type)
35
+{
36
+    // Compute a predicate mask where the bits of an element are 0 if the index
37
+    // is less than the remainder (active), and 1 otherwise.
38
+    const int8x16_t indices = vld1q_s8(quad_reg_byte_indices);
39
+    int8x16_t pred = vreinterpretq_s8_u8(vcgeq_s8(indices, vdupq_n_s8(rem)));
40
+
41
+    // Use predicate mask to shift "unused lanes" outside of range [-2, 2]
42
+    pred = vshlq_n_s8(pred, 3);
43
+    return veorq_s8(edge_type, pred);
44
+}
45
+
46
+/*
47
+ * Compute Edge Offset statistics (count and stats).
48
+ * To save some instructions compute count and stats as negative values - since
49
+ * output of Neon comparison instructions for a matched condition is all 1s (-1).
50
+ */
51
+static inline void compute_eo_stats(const int8x16_t edge_type,
52
+                                    const int16_t *diff, int16x8_t *count,
53
+                                    int32x4_t *stats)
54
+{
55
+    // Create a mask for each edge type.
56
+    int8x16_t mask0 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-2)));
57
+    int8x16_t mask1 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-1)));
58
+    int8x16_t mask2 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(0)));
59
+    int8x16_t mask3 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(1)));
60
+    int8x16_t mask4 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(2)));
61
+
62
+    // Compute negative counts for each edge type.
63
+    count[0] = vpadalq_s8(count[0], mask0);
64
+    count[1] = vpadalq_s8(count[1], mask1);
65
+    count[2] = vpadalq_s8(count[2], mask2);
66
+    count[3] = vpadalq_s8(count[3], mask3);
67
+    count[4] = vpadalq_s8(count[4], mask4);
68
+
69
+    // Widen the masks to 16-bit.
70
+    int16x8_t mask0_lo = vreinterpretq_s16_s8(vzip1q_s8(mask0, mask0));
71
+    int16x8_t mask0_hi = vreinterpretq_s16_s8(vzip2q_s8(mask0, mask0));
72
+    int16x8_t mask1_lo = vreinterpretq_s16_s8(vzip1q_s8(mask1, mask1));
73
+    int16x8_t mask1_hi = vreinterpretq_s16_s8(vzip2q_s8(mask1, mask1));
74
+    int16x8_t mask2_lo = vreinterpretq_s16_s8(vzip1q_s8(mask2, mask2));
75
+    int16x8_t mask2_hi = vreinterpretq_s16_s8(vzip2q_s8(mask2, mask2));
76
+    int16x8_t mask3_lo = vreinterpretq_s16_s8(vzip1q_s8(mask3, mask3));
77
+    int16x8_t mask3_hi = vreinterpretq_s16_s8(vzip2q_s8(mask3, mask3));
78
+    int16x8_t mask4_lo = vreinterpretq_s16_s8(vzip1q_s8(mask4, mask4));
79
+    int16x8_t mask4_hi = vreinterpretq_s16_s8(vzip2q_s8(mask4, mask4));
80
+
81
+    int16x8_t diff_lo = vld1q_s16(diff);
82
+    int16x8_t diff_hi = vld1q_s16(diff + 8);
83
+
84
+    // Compute negative stats for each edge type.
85
+    int16x8_t stats0 = vmulq_s16(diff_lo, mask0_lo);
86
+    int16x8_t stats1 = vmulq_s16(diff_lo, mask1_lo);
87
+    int16x8_t stats2 = vmulq_s16(diff_lo, mask2_lo);
88
+    int16x8_t stats3 = vmulq_s16(diff_lo, mask3_lo);
89
+    int16x8_t stats4 = vmulq_s16(diff_lo, mask4_lo);
90
+    stats0 = vmlaq_s16(stats0, diff_hi, mask0_hi);
91
+    stats1 = vmlaq_s16(stats1, diff_hi, mask1_hi);
92
+    stats2 = vmlaq_s16(stats2, diff_hi, mask2_hi);
93
+    stats3 = vmlaq_s16(stats3, diff_hi, mask3_hi);
94
+    stats4 = vmlaq_s16(stats4, diff_hi, mask4_hi);
95
+
96
+    stats[0] = vpadalq_s16(stats[0], stats0);
97
+    stats[1] = vpadalq_s16(stats[1], stats1);
98
+    stats[2] = vpadalq_s16(stats[2], stats2);
99
+    stats[3] = vpadalq_s16(stats[3], stats3);
100
+    stats[4] = vpadalq_s16(stats[4], stats4);
101
+}
102
+
103
+/*
104
+ * Reduce and store Edge Offset statistics (count and stats).
105
+ */
106
+static inline void reduce_eo_stats(int32x4_t *vstats, int16x8_t *vcount,
107
+                                   int32_t *stats, int32_t *count)
108
+{
109
+    // s_eoTable maps edge types to memory in order: {2, 0, 1, 3, 4}.
110
+    int16x8_t c01 = vpaddq_s16(vcount[2], vcount[0]);
111
+    int16x8_t c23 = vpaddq_s16(vcount[1], vcount[3]);
112
+    int16x8_t c0123 = vpaddq_s16(c01, c23);
113
+
114
+    // Subtract from current count, as we calculate the negation.
115
+    vst1q_s32(count, vsubq_s32(vld1q_s32(count), vpaddlq_s16(c0123)));
116
+    count[4] -= vaddvq_s16(vcount[4]);
117
+
118
+    int32x4_t s01 = vpaddq_s32(vstats[2], vstats[0]);
119
+    int32x4_t s23 = vpaddq_s32(vstats[1], vstats[3]);
120
+    int32x4_t s0123 = vpaddq_s32(s01, s23);
121
+
122
+    // Subtract from current stats, as we calculate the negation.
123
+    vst1q_s32(stats, vsubq_s32(vld1q_s32(stats), s0123));
124
+    stats[4] -= vaddvq_s32(vstats[4]);
125
+}
126
+
127
+namespace X265_NS {
128
+void saoCuStatsBO_neon(const int16_t *diff, const pixel *rec, intptr_t stride,
129
+                       int endX, int endY, int32_t *stats, int32_t *count)
130
+{
131
+#if HIGH_BIT_DEPTH
132
+    const int n_elem = 4;
133
+    const int elem_width = 16;
134
+#else
135
+    const int n_elem = 8;
136
+    const int elem_width = 8;
137
+#endif
138
+
139
+    // Additional temporary buffer for accumulation.
140
+    int32_t stats_tmp[32] = { 0 };
141
+    int32_t count_tmp[32] = { 0 };
142
+
143
+    // Byte-addressable pointers to buffers, to optimise address calculation.
144
+    uint8_t *stats_b[2] = {
145
+        reinterpret_cast<uint8_t *>(stats),
146
+        reinterpret_cast<uint8_t *>(stats_tmp),
147
+    };
148
+    uint8_t *count_b[2] = {
149
+        reinterpret_cast<uint8_t *>(count),
150
+        reinterpret_cast<uint8_t *>(count_tmp),
151
+    };
152
+
153
+    // Combine shift for index calculation with shift for address calculation.
154
+    const int right_shift = X265_DEPTH - X265_NS::SAO::SAO_BO_BITS;
155
+    const int left_shift = 2;
156
+    const int shift = right_shift - left_shift;
157
+    // Mask out bits 7, 1 & 0 to account for combination of shifts.
158
+    const int mask = 0x7c;
159
+
160
+    // Compute statistics into temporary buffers.
161
+    for (int y = 0; y < endY; y++)
162
+    {
163
+        int x = 0;
164
+        for (; x + n_elem < endX; x += n_elem)
165
+        {
166
+            uint64_t class_idx_64 =
167
+                *reinterpret_cast<const uint64_t *>(rec + x) >> shift;
168
+
169
+            for (int i = 0; i < n_elem; ++i)
170
+            {
171
+                const int idx = i & 1;
172
+                const int off  = (class_idx_64 >> (i * elem_width)) & mask;
173
+                *reinterpret_cast<uint32_t*>(stats_b[idx] + off) += diff[x + i];
174
+                *reinterpret_cast<uint32_t*>(count_b[idx] + off) += 1;
175
+            }
176
+        }
177
+
178
+        if (x < endX)
179
+        {
180
+            uint64_t class_idx_64 =
181
+                *reinterpret_cast<const uint64_t *>(rec + x) >> shift;
182
+
183
+            for (int i = 0; (i + x) < endX; ++i)
184
+            {
185
+                const int idx = i & 1;
186
+                const int off  = (class_idx_64 >> (i * elem_width)) & mask;
187
+                *reinterpret_cast<uint32_t*>(stats_b[idx] + off) += diff[x + i];
188
+                *reinterpret_cast<uint32_t*>(count_b[idx] + off) += 1;
189
+            }
190
+        }
191
+
192
+        diff += MAX_CU_SIZE;
193
+        rec += stride;
194
+    }
195
+
196
+    // Reduce temporary buffers to destination using Neon.
197
+    for (int i = 0; i < 32; i += 4)
198
+    {
199
+        int32x4_t s0 = vld1q_s32(stats_tmp + i);
200
+        int32x4_t s1 = vld1q_s32(stats + i);
201
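
Note on saoCuStatsBO_neon above: the band class of a reconstructed pixel is its top SAO_BO_BITS bits, and the code folds the class computation and the conversion to a byte offset into a single shift plus the 0x7c mask. A scalar sketch of that arithmetic for the 8-bit case (assuming X265_DEPTH is 8 and 32 bands, i.e. SAO_BO_BITS is 5; the helper name is invented):

    #include <cstdint>

    // (rec >> 3) selects one of 32 bands; multiplying by 4 gives the byte
    // offset into an int32 table. Folded into one step: (rec >> 1) & 0x7c,
    // with bits 0, 1 and 7 masked out exactly as in the comment above.
    static inline int bo_byte_offset(uint8_t rec)
    {
        return (rec >> 1) & 0x7c;   // == (rec >> 3) * 4
    }
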
x265_4.0.tar.gz/source/common/aarch64/sao-prim.h Added
72
 
1
@@ -0,0 +1,70 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_COMMON_AARCH64_SAO_PRIM_H
26
+#define X265_COMMON_AARCH64_SAO_PRIM_H
27
+
28
+#include "neon-sve-bridge.h"
29
+#include "primitives.h"
30
+#include <arm_neon.h>
31
+
32
+static inline int8x16_t signOf_neon(const pixel *a, const pixel *b)
33
+{
34
+#if HIGH_BIT_DEPTH
35
+    uint16x8_t s0_lo = vld1q_u16(a);
36
+    uint16x8_t s0_hi = vld1q_u16(a + 8);
37
+    uint16x8_t s1_lo = vld1q_u16(b);
38
+    uint16x8_t s1_hi = vld1q_u16(b + 8);
39
+
40
+    // signOf(a - b) = -(a > b ? -1 : 0) | (a < b ? -1 : 0)
41
+    int16x8_t cmp0_lo = vreinterpretq_s16_u16(vcgtq_u16(s0_lo, s1_lo));
42
+    int16x8_t cmp0_hi = vreinterpretq_s16_u16(vcgtq_u16(s0_hi, s1_hi));
43
+    int16x8_t cmp1_lo = vreinterpretq_s16_u16(vcgtq_u16(s1_lo, s0_lo));
44
+    int16x8_t cmp1_hi = vreinterpretq_s16_u16(vcgtq_u16(s1_hi, s0_hi));
45
+
46
+    int8x16_t cmp0 = vcombine_s8(vmovn_s16(cmp0_lo), vmovn_s16(cmp0_hi));
47
+    int8x16_t cmp1 = vcombine_s8(vmovn_s16(cmp1_lo), vmovn_s16(cmp1_hi));
48
+#else // HIGH_BIT_DEPTH
49
+    uint8x16_t s0 = vld1q_u8(a);
50
+    uint8x16_t s1 = vld1q_u8(b);
51
+
52
+    // signOf(a - b) = -(a > b ? -1 : 0) | (a < b ? -1 : 0)
53
+    int8x16_t cmp0 = vreinterpretq_s8_u8(vcgtq_u8(s0, s1));
54
+    int8x16_t cmp1 = vreinterpretq_s8_u8(vcgtq_u8(s1, s0));
55
+#endif // HIGH_BIT_DEPTH
56
+    return vorrq_s8(vnegq_s8(cmp0), cmp1);
57
+}
58
+
59
+namespace X265_NS {
60
+void setupSaoPrimitives_neon(EncoderPrimitives &p);
61
+
62
+#if defined(HAVE_SVE) && HAVE_SVE_BRIDGE
63
+void setupSaoPrimitives_sve(EncoderPrimitives &p);
64
+#endif
65
+
66
+#if defined(HAVE_SVE2) && HAVE_SVE_BRIDGE
67
+void setupSaoPrimitives_sve2(EncoderPrimitives &p);
68
+#endif
69
+}
70
+
71
+#endif // X265_COMMON_AARCH64_SAO_PRIM_H
72
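
Note on signOf_neon in sao-prim.h above: the helper returns +1, 0 or -1 per lane from two unsigned compares. The scalar form it vectorises is the branch-free sign of a difference (illustrative helper name, not part of the x265 sources):

    // Returns +1 if a > b, -1 if a < b, 0 if equal; the two comparisons map
    // one-to-one onto the vcgtq pair in the header above.
    static inline int signOf_scalar(int a, int b)
    {
        return (a > b) - (a < b);
    }
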
x265_3.6.tar.gz/source/common/aarch64/ssd-a-common.S -> x265_4.0.tar.gz/source/common/aarch64/ssd-a-common.S Changed
12
 
1
@@ -29,9 +29,7 @@
2
 .arch           armv8-a
3
 
4
 .macro ret_v0_w0
5
-    trn2            v1.2d, v0.2d, v0.2d
6
-    add             v0.2s, v0.2s, v1.2s
7
-    addp            v0.2s, v0.2s, v0.2s
8
+    addv            s0, v0.4s
9
     fmov            w0, s0
10
     ret
11
 .endm
12
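
Note on the ssd-a-common.S change above: the old trn2/add/addp sequence and the new single ADDV both reduce the four 32-bit accumulator lanes to one scalar, so in intrinsics terms the whole ret_v0_w0 macro is a single horizontal add (illustrative only, invented name):

    #include <arm_neon.h>
    #include <cstdint>

    // Horizontal add of the SSE accumulator, i.e. addv s0, v0.4s ; fmov w0, s0.
    static inline int32_t reduce_sse_acc(int32x4_t acc)
    {
        return vaddvq_s32(acc);
    }
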
x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve2.S -> x265_4.0.tar.gz/source/common/aarch64/ssd-a-sve2.S Changed
201
 
1
@@ -36,267 +36,6 @@
2
 
3
 .text
4
 
5
-function PFX(pixel_sse_pp_32x32_sve2)
6
-    rdvl            x9, #1
7
-    cmp             x9, #16
8
-    bgt             .vl_gt_16_pixel_sse_pp_32x32
9
-    mov             w12, #8
10
-    movi            v0.16b, #0
11
-    movi            v1.16b, #0
12
-.loop_sse_pp_32_sve2:
13
-    sub             w12, w12, #1
14
-.rept 4
15
-    ld1             {v16.16b,v17.16b}, x0, x1
16
-    ld1             {v18.16b,v19.16b}, x2, x3
17
-    usubl           v2.8h, v16.8b, v18.8b
18
-    usubl2          v3.8h, v16.16b, v18.16b
19
-    usubl           v4.8h, v17.8b, v19.8b
20
-    usubl2          v5.8h, v17.16b, v19.16b
21
-    smlal           v0.4s, v2.4h, v2.4h
22
-    smlal2          v1.4s, v2.8h, v2.8h
23
-    smlal           v0.4s, v3.4h, v3.4h
24
-    smlal2          v1.4s, v3.8h, v3.8h
25
-    smlal           v0.4s, v4.4h, v4.4h
26
-    smlal2          v1.4s, v4.8h, v4.8h
27
-    smlal           v0.4s, v5.4h, v5.4h
28
-    smlal2          v1.4s, v5.8h, v5.8h
29
-.endr
30
-    cbnz            w12, .loop_sse_pp_32_sve2
31
-    add             v0.4s, v0.4s, v1.4s
32
-    ret_v0_w0
33
-.vl_gt_16_pixel_sse_pp_32x32:
34
-    ptrue           p0.b, vl32
35
-    ld1b            {z16.b}, p0/z, x0
36
-    ld1b            {z18.b}, p0/z, x2
37
-    add             x0, x0, x1
38
-    add             x2, x2, x3
39
-    usublb          z1.h, z16.b, z18.b
40
-    usublt          z2.h, z16.b, z18.b
41
-    smullb          z0.s, z1.h, z1.h
42
-    smlalt          z0.s, z1.h, z1.h
43
-    smlalb          z0.s, z2.h, z2.h
44
-    smlalt          z0.s, z2.h, z2.h
45
-.rept 31
46
-    ld1b            {z16.b}, p0/z, x0
47
-    ld1b            {z18.b}, p0/z, x2
48
-    add             x0, x0, x1
49
-    add             x2, x2, x3
50
-    usublb          z1.h, z16.b, z18.b
51
-    usublt          z2.h, z16.b, z18.b
52
-    smullb          z0.s, z1.h, z1.h
53
-    smlalt          z0.s, z1.h, z1.h
54
-    smlalb          z0.s, z2.h, z2.h
55
-    smlalt          z0.s, z2.h, z2.h
56
-.endr
57
-    uaddv           d3, p0, z0.s
58
-    fmov            w0, s3
59
-    ret
60
-endfunc
61
-
62
-function PFX(pixel_sse_pp_32x64_sve2)
63
-    rdvl            x9, #1
64
-    cmp             x9, #16
65
-    bgt             .vl_gt_16_pixel_sse_pp_32x64
66
-    ptrue           p0.b, vl16
67
-    ld1b            {z16.b}, p0/z, x0
68
-    ld1b            {z17.b}, p0/z, x0, #1, mul vl
69
-    ld1b            {z18.b}, p0/z, x2
70
-    ld1b            {z19.b}, p0/z, x2, #1, mul vl
71
-    add             x0, x0, x1
72
-    add             x2, x2, x3
73
-    usublb          z1.h, z16.b, z18.b
74
-    usublt          z2.h, z16.b, z18.b
75
-    usublb          z3.h, z17.b, z19.b
76
-    usublt          z4.h, z17.b, z19.b
77
-    smullb          z20.s, z1.h, z1.h
78
-    smullt          z21.s, z1.h, z1.h
79
-    smlalb          z20.s, z2.h, z2.h
80
-    smlalt          z21.s, z2.h, z2.h
81
-    smlalb          z20.s, z3.h, z3.h
82
-    smlalt          z21.s, z3.h, z3.h
83
-    smlalb          z20.s, z4.h, z4.h
84
-    smlalt          z21.s, z4.h, z4.h
85
-.rept 63
86
-    ld1b            {z16.b}, p0/z, x0
87
-    ld1b            {z17.b}, p0/z, x0, #1, mul vl
88
-    ld1b            {z18.b}, p0/z, x2
89
-    ld1b            {z19.b}, p0/z, x2, #1, mul vl
90
-    add             x0, x0, x1
91
-    add             x2, x2, x3
92
-    usublb          z1.h, z16.b, z18.b
93
-    usublt          z2.h, z16.b, z18.b
94
-    usublb          z3.h, z17.b, z19.b
95
-    usublt          z4.h, z17.b, z19.b
96
-    smlalb          z20.s, z1.h, z1.h
97
-    smlalt          z21.s, z1.h, z1.h
98
-    smlalb          z20.s, z2.h, z2.h
99
-    smlalt          z21.s, z2.h, z2.h
100
-    smlalb          z20.s, z3.h, z3.h
101
-    smlalt          z21.s, z3.h, z3.h
102
-    smlalb          z20.s, z4.h, z4.h
103
-    smlalt          z21.s, z4.h, z4.h
104
-.endr
105
-    uaddv           d3, p0, z20.s
106
-    fmov            w0, s3
107
-    uaddv           d4, p0, z21.s
108
-    fmov            w1, s4
109
-    add             w0, w0, w1
110
-    ret
111
-.vl_gt_16_pixel_sse_pp_32x64:
112
-    ptrue           p0.b, vl32
113
-    ld1b            {z16.b}, p0/z, x0
114
-    ld1b            {z18.b}, p0/z, x2
115
-    add             x0, x0, x1
116
-    add             x2, x2, x3
117
-    usublb          z1.h, z16.b, z18.b
118
-    usublt          z2.h, z16.b, z18.b
119
-    smullb          z20.s, z1.h, z1.h
120
-    smullt          z21.s, z1.h, z1.h
121
-    smlalb          z20.s, z2.h, z2.h
122
-    smlalt          z21.s, z2.h, z2.h
123
-.rept 63
124
-    ld1b            {z16.b}, p0/z, x0
125
-    ld1b            {z18.b}, p0/z, x2
126
-    add             x0, x0, x1
127
-    add             x2, x2, x3
128
-    usublb          z1.h, z16.b, z18.b
129
-    usublt          z2.h, z16.b, z18.b
130
-    smlalb          z20.s, z1.h, z1.h
131
-    smlalt          z21.s, z1.h, z1.h
132
-    smlalb          z20.s, z2.h, z2.h
133
-    smlalt          z21.s, z2.h, z2.h
134
-.endr
135
-    uaddv           d3, p0, z20.s
136
-    fmov            w0, s3
137
-    uaddv           d4, p0, z21.s
138
-    fmov            w1, s4
139
-    add             w0, w0, w1
140
-    ret
141
-endfunc
142
-
143
-function PFX(pixel_sse_pp_64x64_sve2)
144
-    rdvl            x9, #1
145
-    cmp             x9, #16
146
-    bgt             .vl_gt_16_pixel_sse_pp_64x64
147
-    mov             w12, #16
148
-    movi            v0.16b, #0
149
-    movi            v1.16b, #0
150
-
151
-.loop_sse_pp_64_sve2:
152
-    sub             w12, w12, #1
153
-.rept 4
154
-    ld1             {v16.16b-v19.16b}, x0, x1
155
-    ld1             {v20.16b-v23.16b}, x2, x3
156
-
157
-    usubl           v2.8h, v16.8b, v20.8b
158
-    usubl2          v3.8h, v16.16b, v20.16b
159
-    usubl           v4.8h, v17.8b, v21.8b
160
-    usubl2          v5.8h, v17.16b, v21.16b
161
-    smlal           v0.4s, v2.4h, v2.4h
162
-    smlal2          v1.4s, v2.8h, v2.8h
163
-    smlal           v0.4s, v3.4h, v3.4h
164
-    smlal2          v1.4s, v3.8h, v3.8h
165
-    smlal           v0.4s, v4.4h, v4.4h
166
-    smlal2          v1.4s, v4.8h, v4.8h
167
-    smlal           v0.4s, v5.4h, v5.4h
168
-    smlal2          v1.4s, v5.8h, v5.8h
169
-
170
-    usubl           v2.8h, v18.8b, v22.8b
171
-    usubl2          v3.8h, v18.16b, v22.16b
172
-    usubl           v4.8h, v19.8b, v23.8b
173
-    usubl2          v5.8h, v19.16b, v23.16b
174
-    smlal           v0.4s, v2.4h, v2.4h
175
-    smlal2          v1.4s, v2.8h, v2.8h
176
-    smlal           v0.4s, v3.4h, v3.4h
177
-    smlal2          v1.4s, v3.8h, v3.8h
178
-    smlal           v0.4s, v4.4h, v4.4h
179
-    smlal2          v1.4s, v4.8h, v4.8h
180
-    smlal           v0.4s, v5.4h, v5.4h
181
-    smlal2          v1.4s, v5.8h, v5.8h
182
-.endr
183
-    cbnz            w12, .loop_sse_pp_64_sve2
184
-    add             v0.4s, v0.4s, v1.4s
185
-    ret_v0_w0
186
-.vl_gt_16_pixel_sse_pp_64x64:
187
-    cmp             x9, #48
188
-    bgt             .vl_gt_48_pixel_sse_pp_64x64
189
-    ptrue           p0.b, vl32
190
-    ld1b            {z16.b}, p0/z, x0
191
-    ld1b            {z17.b}, p0/z, x0, #1, mul vl
192
-    ld1b            {z20.b}, p0/z, x2
193
-    ld1b            {z21.b}, p0/z, x2, #1, mul vl
194
-    add             x0, x0, x1
195
-    add             x2, x2, x3
196
-    usublb          z1.h, z16.b, z20.b
197
-    usublt          z2.h, z16.b, z20.b
198
-    usublb          z3.h, z17.b, z21.b
199
-    usublt          z4.h, z17.b, z21.b
200
-    smullb          z24.s, z1.h, z1.h
201
x265_3.6.tar.gz/source/common/aarch64/ssd-a.S -> x265_4.0.tar.gz/source/common/aarch64/ssd-a.S Changed
201
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2021 MulticoreWare, Inc
3
  *
4
  * Authors: Sebastian Pop <spop@amazon.com>
5
+ *          Hari Limaye <hari.limaye@arm.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -34,217 +35,145 @@
10
 
11
 .text
12
 
13
-function PFX(pixel_sse_pp_4x4_neon)
14
-    ld1             {v16.s}0, x0, x1
15
-    ld1             {v17.s}0, x2, x3
16
-    ld1             {v18.s}0, x0, x1
17
-    ld1             {v19.s}0, x2, x3
18
-    ld1             {v20.s}0, x0, x1
19
-    ld1             {v21.s}0, x2, x3
20
-    ld1             {v22.s}0, x0, x1
21
-    ld1             {v23.s}0, x2, x3
22
-
23
-    usubl           v1.8h, v16.8b, v17.8b
24
-    usubl           v2.8h, v18.8b, v19.8b
25
-    usubl           v3.8h, v20.8b, v21.8b
26
-    usubl           v4.8h, v22.8b, v23.8b
27
-
28
-    smull           v0.4s, v1.4h, v1.4h
29
-    smlal           v0.4s, v2.4h, v2.4h
30
-    smlal           v0.4s, v3.4h, v3.4h
31
-    smlal           v0.4s, v4.4h, v4.4h
32
-    ret_v0_w0
33
-endfunc
34
+// Fully unrolled.
35
+.macro SSE_PP_4xN h
36
+function PFX(pixel_sse_pp_4x\h\()_neon)
37
+    movi            v0.4s, #0
38
+.rept \h / 2
39
+    ldr             s16, x0
40
+    ldr             s17, x2
41
+    add             x0, x0, x1
42
+    add             x2, x2, x3
43
+    ld1             {v16.s}1, x0, x1
44
+    ld1             {v17.s}1, x2, x3
45
 
46
-function PFX(pixel_sse_pp_4x8_neon)
47
-    ld1             {v16.s}0, x0, x1
48
-    ld1             {v17.s}0, x2, x3
49
-    usubl           v1.8h, v16.8b, v17.8b
50
-    ld1             {v16.s}0, x0, x1
51
-    ld1             {v17.s}0, x2, x3
52
-    smull           v0.4s, v1.4h, v1.4h
53
-.rept 6
54
-    usubl           v1.8h, v16.8b, v17.8b
55
-    ld1             {v16.s}0, x0, x1
56
-    smlal           v0.4s, v1.4h, v1.4h
57
-    ld1             {v17.s}0, x2, x3
58
+    uabd            v1.8b, v16.8b, v17.8b
59
+    umull           v20.8h, v1.8b, v1.8b
60
+    uadalp          v0.4s, v20.8h
61
 .endr
62
-    usubl           v1.8h, v16.8b, v17.8b
63
-    smlal           v0.4s, v1.4h, v1.4h
64
     ret_v0_w0
65
 endfunc
66
+.endm
67
 
68
-function PFX(pixel_sse_pp_8x8_neon)
69
-    ld1             {v16.8b}, x0, x1
70
-    ld1             {v17.8b}, x2, x3
71
-    usubl           v1.8h, v16.8b, v17.8b
72
-    ld1             {v16.8b}, x0, x1
73
-    smull           v0.4s, v1.4h, v1.4h
74
-    smlal2          v0.4s, v1.8h, v1.8h
75
-    ld1             {v17.8b}, x2, x3
76
-
77
-.rept 6
78
-    usubl           v1.8h, v16.8b, v17.8b
79
-    ld1             {v16.8b}, x0, x1
80
-    smlal           v0.4s, v1.4h, v1.4h
81
-    smlal2          v0.4s, v1.8h, v1.8h
82
-    ld1             {v17.8b}, x2, x3
83
-.endr
84
-    usubl           v1.8h, v16.8b, v17.8b
85
-    smlal           v0.4s, v1.4h, v1.4h
86
-    smlal2          v0.4s, v1.8h, v1.8h
87
-    ret_v0_w0
88
-endfunc
89
+SSE_PP_4xN 4
90
+SSE_PP_4xN 8
91
 
92
-function PFX(pixel_sse_pp_8x16_neon)
93
-    ld1             {v16.8b}, x0, x1
94
-    ld1             {v17.8b}, x2, x3
95
-    usubl           v1.8h, v16.8b, v17.8b
96
+// Fully unrolled.
97
+.macro SSE_PP_8xN h
98
+function PFX(pixel_sse_pp_8x\h\()_neon)
99
+    movi            v0.4s, #0
100
+.rept \h
101
     ld1             {v16.8b}, x0, x1
102
-    smull           v0.4s, v1.4h, v1.4h
103
-    smlal2          v0.4s, v1.8h, v1.8h
104
     ld1             {v17.8b}, x2, x3
105
 
106
-.rept 14
107
-    usubl           v1.8h, v16.8b, v17.8b
108
-    ld1             {v16.8b}, x0, x1
109
-    smlal           v0.4s, v1.4h, v1.4h
110
-    smlal2          v0.4s, v1.8h, v1.8h
111
-    ld1             {v17.8b}, x2, x3
112
+    uabd            v1.8b, v16.8b, v17.8b
113
+    umull           v20.8h, v1.8b, v1.8b
114
+    uadalp          v0.4s, v20.8h
115
 .endr
116
-    usubl           v1.8h, v16.8b, v17.8b
117
-    smlal           v0.4s, v1.4h, v1.4h
118
-    smlal2          v0.4s, v1.8h, v1.8h
119
     ret_v0_w0
120
 endfunc
121
+.endm
122
+
123
+SSE_PP_8xN 8
124
+SSE_PP_8xN 16
125
 
126
-.macro sse_pp_16xN h
127
+// Fully unrolled.
128
+.macro SSE_PP_16xN h
129
 function PFX(pixel_sse_pp_16x\h\()_neon)
130
+    movi            v0.4s, #0
131
+    movi            v1.4s, #0
132
+.rept \h
133
     ld1             {v16.16b}, x0, x1
134
     ld1             {v17.16b}, x2, x3
135
-    usubl           v1.8h, v16.8b, v17.8b
136
-    usubl2          v2.8h, v16.16b, v17.16b
137
-    ld1             {v16.16b}, x0, x1
138
-    ld1             {v17.16b}, x2, x3
139
-    smull           v0.4s, v1.4h, v1.4h
140
-    smlal2          v0.4s, v1.8h, v1.8h
141
-    smlal           v0.4s, v2.4h, v2.4h
142
-    smlal2          v0.4s, v2.8h, v2.8h
143
-.rept \h - 2
144
-    usubl           v1.8h, v16.8b, v17.8b
145
-    usubl2          v2.8h, v16.16b, v17.16b
146
-    ld1             {v16.16b}, x0, x1
147
-    smlal           v0.4s, v1.4h, v1.4h
148
-    smlal2          v0.4s, v1.8h, v1.8h
149
-    ld1             {v17.16b}, x2, x3
150
-    smlal           v0.4s, v2.4h, v2.4h
151
-    smlal2          v0.4s, v2.8h, v2.8h
152
+
153
+    uabd            v2.16b, v16.16b, v17.16b
154
+    umull           v20.8h, v2.8b, v2.8b
155
+    uadalp          v0.4s, v20.8h
156
+    umull2          v21.8h, v2.16b, v2.16b
157
+    uadalp          v1.4s, v21.8h
158
 .endr
159
-    usubl           v1.8h, v16.8b, v17.8b
160
-    usubl2          v2.8h, v16.16b, v17.16b
161
-    smlal           v0.4s, v1.4h, v1.4h
162
-    smlal2          v0.4s, v1.8h, v1.8h
163
-    smlal           v0.4s, v2.4h, v2.4h
164
-    smlal2          v0.4s, v2.8h, v2.8h
165
+    add             v0.4s, v0.4s, v1.4s
166
     ret_v0_w0
167
 endfunc
168
 .endm
169
 
170
-sse_pp_16xN 16
171
-sse_pp_16xN 32
172
+SSE_PP_16xN 16
173
+SSE_PP_16xN 32
174
 
175
-function PFX(pixel_sse_pp_32x32_neon)
176
-    mov             w12, #8
177
-    movi            v0.16b, #0
178
-    movi            v1.16b, #0
179
-.loop_sse_pp_32:
180
-    sub             w12, w12, #1
181
+// Loop unrolled to process 4 rows per iteration.
182
+function PFX(pixel_sse_pp_32xh_neon), export=0
183
+    movi            v0.4s, #0
184
+    movi            v1.4s, #0
185
+.Loop_sse_pp_32xh:
186
+    sub             w4, w4, #1
187
 .rept 4
188
     ld1             {v16.16b,v17.16b}, x0, x1
189
     ld1             {v18.16b,v19.16b}, x2, x3
190
-    usubl           v2.8h, v16.8b, v18.8b
191
-    usubl2          v3.8h, v16.16b, v18.16b
192
-    usubl           v4.8h, v17.8b, v19.8b
193
-    usubl2          v5.8h, v17.16b, v19.16b
194
-    smlal           v0.4s, v2.4h, v2.4h
195
-    smlal2          v1.4s, v2.8h, v2.8h
196
-    smlal           v0.4s, v3.4h, v3.4h
197
-    smlal2          v1.4s, v3.8h, v3.8h
198
-    smlal           v0.4s, v4.4h, v4.4h
199
-    smlal2          v1.4s, v4.8h, v4.8h
200
-    smlal           v0.4s, v5.4h, v5.4h
201
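The rewritten sse_pp kernels above drop the signed widening-subtract/multiply-accumulate chains in favour of an unsigned absolute difference, a widening square and a pairwise accumulate. A sketch of one 8-pixel step with intrinsics (illustrative only; the shipped code is assembly):

    #include <arm_neon.h>

    static inline uint32x4_t sse_step_8(uint32x4_t acc, uint8x8_t a, uint8x8_t b)
    {
        uint8x8_t  d  = vabd_u8(a, b);    // |a - b| per pixel        (UABD)
        uint16x8_t sq = vmull_u8(d, d);   // widen and square         (UMULL)
        return vpadalq_u16(acc, sq);      // accumulate into 32 bits  (UADALP)
    }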
x265_4.0.tar.gz/source/common/aarch64/ssd-neon-dotprod.S Added
171
 
1
@@ -0,0 +1,169 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+
27
+.arch armv8.2-a+dotprod
28
+
29
+#ifdef __APPLE__
30
+.section __RODATA,__rodata
31
+#else
32
+.section .rodata
33
+#endif
34
+
35
+.align 4
36
+
37
+.text
38
+
39
+// Fully unrolled.
40
+.macro SSE_PP_4xN h
41
+function PFX(pixel_sse_pp_4x\h\()_neon_dotprod)
42
+    movi            v0.4s, #0
43
+.rept \h / 4
44
+    ldr             s16, x0
45
+    ldr             s17, x2
46
+    add             x0, x0, x1
47
+    add             x2, x2, x3
48
+    ld1             {v16.s}1, x0, x1
49
+    ld1             {v16.s}2, x0, x1
50
+    ld1             {v16.s}3, x0, x1
51
+    ld1             {v17.s}1, x2, x3
52
+    ld1             {v17.s}2, x2, x3
53
+    ld1             {v17.s}3, x2, x3
54
+
55
+    uabd            v1.16b, v16.16b, v17.16b
56
+    udot            v0.4s, v1.16b, v1.16b
57
+.endr
58
+    addv            s0, v0.4s
59
+    fmov            w0, s0
60
+    ret
61
+endfunc
62
+.endm
63
+
64
+SSE_PP_4xN 4
65
+SSE_PP_4xN 8
66
+
67
+// Fully unrolled.
68
+.macro SSE_PP_8xN h
69
+function PFX(pixel_sse_pp_8x\h\()_neon_dotprod)
70
+    movi            v0.4s, #0
71
+.rept \h
72
+    ld1             {v16.8b}, x0, x1
73
+    ld1             {v17.8b}, x2, x3
74
+
75
+    uabd            v1.8b, v16.8b, v17.8b
76
+    udot            v0.2s, v1.8b, v1.8b
77
+.endr
78
+    addv            s0, v0.4s
79
+    fmov            w0, s0
80
+    ret
81
+endfunc
82
+.endm
83
+
84
+SSE_PP_8xN 8
85
+SSE_PP_8xN 16
86
+
87
+// Fully unrolled.
88
+.macro SSE_PP_16xN h
89
+function PFX(pixel_sse_pp_16x\h\()_neon_dotprod)
90
+    movi            v0.4s, #0
91
+    movi            v1.4s, #0
92
+.rept \h / 2
93
+    ld1             {v16.16b}, x0, x1
94
+    ld1             {v17.16b}, x2, x3
95
+    ld1             {v18.16b}, x0, x1
96
+    ld1             {v19.16b}, x2, x3
97
+
98
+    uabd            v2.16b, v16.16b, v17.16b
99
+    udot            v0.4s, v2.16b, v2.16b
100
+    uabd            v3.16b, v18.16b, v19.16b
101
+    udot            v1.4s, v3.16b, v3.16b
102
+.endr
103
+    add             v0.4s, v0.4s, v1.4s
104
+    addv            s0, v0.4s
105
+    fmov            w0, s0
106
+    ret
107
+endfunc
108
+.endm
109
+
110
+SSE_PP_16xN 16
111
+SSE_PP_16xN 32
112
+
113
+// Loop unrolled to process 4 rows per iteration.
114
+function PFX(pixel_sse_pp_32xh_neon_dotprod), export=0
115
+    movi            v0.4s, #0
116
+    movi            v1.4s, #0
117
+.Loop_sse_pp_32xh:
118
+    sub             w4, w4, #1
119
+.rept 4
120
+    ld1             {v16.16b,v17.16b}, x0, x1
121
+    ld1             {v18.16b,v19.16b}, x2, x3
122
+
123
+    uabd            v2.16b, v16.16b, v18.16b
124
+    udot            v0.4s, v2.16b, v2.16b
125
+    uabd            v3.16b, v17.16b, v19.16b
126
+    udot            v1.4s, v3.16b, v3.16b
127
+.endr
128
+    cbnz            w4, .Loop_sse_pp_32xh
129
+    add             v0.4s, v0.4s, v1.4s
130
+    addv            s0, v0.4s
131
+    fmov            w0, s0
132
+    ret
133
+endfunc
134
+
135
+.macro SSE_PP_32xN h
136
+function PFX(pixel_sse_pp_32x\h\()_neon_dotprod)
137
+    mov             w4, \h / 4
138
+    b               PFX(pixel_sse_pp_32xh_neon_dotprod)
139
+endfunc
140
+.endm
141
+
142
+SSE_PP_32xN 32
143
+SSE_PP_32xN 64
144
+
145
+// Loop unrolled to process 4 rows per iteration.
146
+function PFX(pixel_sse_pp_64x64_neon_dotprod)
147
+    mov             w12, #16
148
+    movi            v0.4s, #0
149
+    movi            v1.4s, #0
150
+.Loop_sse_pp_64:
151
+    sub             w12, w12, #1
152
+.rept 4
153
+    ld1             {v16.16b-v19.16b}, x0, x1
154
+    ld1             {v20.16b-v23.16b}, x2, x3
155
+
156
+    uabd            v2.16b, v16.16b, v20.16b
157
+    udot            v0.4s, v2.16b, v2.16b
158
+    uabd            v3.16b, v17.16b, v21.16b
159
+    udot            v1.4s, v3.16b, v3.16b
160
+    uabd            v4.16b, v18.16b, v22.16b
161
+    udot            v0.4s, v4.16b, v4.16b
162
+    uabd            v5.16b, v19.16b, v23.16b
163
+    udot            v1.4s, v5.16b, v5.16b
164
+.endr
165
+    cbnz            w12, .Loop_sse_pp_64
166
+    add             v0.4s, v0.4s, v1.4s
167
+    addv            s0, v0.4s
168
+    fmov            w0, s0
169
+    ret
170
+endfunc
171
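The new neon_dotprod file repeats the pattern above but lets UDOT do the squaring and accumulation in one step: dotting the absolute-difference vector with itself sums d*d over each group of four lanes directly into the 32-bit accumulators. A hedged intrinsics equivalent of one 16-pixel step (sketch, assuming a compiler targeting armv8.2-a+dotprod):

    #include <arm_neon.h>

    static inline uint32x4_t sse_step_16_dotprod(uint32x4_t acc, uint8x16_t a, uint8x16_t b)
    {
        uint8x16_t d = vabdq_u8(a, b);    // |a - b| per pixel (UABD)
        return vdotq_u32(acc, d, d);      // acc += d*d, four lanes at a time (UDOT)
    }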
x265_3.6.tar.gz/source/common/arm/blockcopy8.S -> x265_4.0.tar.gz/source/common/arm/blockcopy8.S Changed
19
 
1
@@ -795,7 +795,7 @@
2
     vmov            q2, q12
3
     vmov            q3, q14
4
 
5
-.loop:    
6
+.Loop:
7
     vldm            r0!, {q8-q15}
8
     subs            r1, #1
9
 
10
@@ -817,7 +817,7 @@
11
     vadd.s8         q1, q10
12
     vadd.s8         q2, q12
13
     vadd.s8         q3, q14
14
-    bgt            .loop
15
+    bgt            .Loop
16
 
17
     // sum
18
     vadd.s8         q0, q1
19
x265_3.6.tar.gz/source/common/arm/dct-a.S -> x265_4.0.tar.gz/source/common/arm/dct-a.S Changed
37
 
1
@@ -422,7 +422,7 @@
2
     mov lr, #4*16*2
3
 
4
     // DCT-1D
5
-.loop1:
6
+.Loop1:
7
     // Row0-3
8
     vld1.16 {q8-q9}, r0, :64, r2      // q8  = 07 06 05 04 03 02 01 00, q9  = 0F 0E 0D 0C 0B 0A 09 08
9
     vld1.16 {q10-q11}, r0, :64, r2    // q10 = 17 16 15 14 13 12 11 10, q11 = 1F 1E 1D 1C 1B 1A 19 18
10
@@ -628,7 +628,7 @@
11
     // loop into next process group
12
     sub r3, #3*4*16*2
13
     subs r12, #1
14
-    bgt .loop1
15
+    bgt .Loop1
16
 
17
 
18
     // DCT-2D
19
@@ -637,7 +637,7 @@
20
     mov r3, #16*2*2
21
     mov r12, #16/4                      // Process 4 rows every loop
22
 
23
-.loop2:
24
+.Loop2:
25
     vldm r2, {q8-q15}
26
 
27
     // d16 = 30 20 10 00
28
@@ -887,7 +887,7 @@
29
 
30
     sub r1, #(17*16-4)*2
31
     subs r12, #1
32
-    bgt .loop2
33
+    bgt .Loop2
34
 
35
     add sp, #16*16*2
36
     vpop {q4-q7}
37
x265_3.6.tar.gz/source/common/arm/ipfilter8.S -> x265_4.0.tar.gz/source/common/arm/ipfilter8.S Changed
201
 
1
@@ -372,7 +372,7 @@
2
     vmov.u16    q1, #8192
3
     vneg.s16    q1, q1
4
     mov         r12, #8
5
-.loop_filterP2S_32x16:
6
+.Loop_filterP2S_32x16:
7
     subs        r12, #1
8
 .rept 2
9
     vld1.u8     {q9-q10}, r0, r1
10
@@ -391,7 +391,7 @@
11
     vmla.s16    q3, q10, q0
12
     vst1.16     {q2-q3}, r2, r3
13
 .endr
14
-    bgt         .loop_filterP2S_32x16
15
+    bgt         .Loop_filterP2S_32x16
16
     bx          lr
17
 endfunc
18
 
19
@@ -402,7 +402,7 @@
20
     vmov.u16    q1, #8192
21
     vneg.s16    q1, q1
22
     mov         r12, #12
23
-.loop_filterP2S_32x24:
24
+.Loop_filterP2S_32x24:
25
     subs        r12, #1
26
 .rept 2
27
     vld1.u8     {q9-q10}, r0, r1
28
@@ -421,7 +421,7 @@
29
     vmla.s16    q3, q10, q0
30
     vst1.16     {q2-q3}, r2, r3
31
 .endr
32
-    bgt         .loop_filterP2S_32x24
33
+    bgt         .Loop_filterP2S_32x24
34
     bx          lr
35
 endfunc
36
 
37
@@ -432,7 +432,7 @@
38
     vmov.u16    q1, #8192
39
     vneg.s16    q1, q1
40
     mov         r12, #16
41
-.loop_filterP2S_32x32:
42
+.Loop_filterP2S_32x32:
43
     subs        r12, #1
44
 .rept 2
45
     vld1.u8     {q9-q10}, r0, r1
46
@@ -451,7 +451,7 @@
47
     vmla.s16    q3, q10, q0
48
     vst1.16     {q2-q3}, r2, r3
49
 .endr
50
-    bgt         .loop_filterP2S_32x32
51
+    bgt         .Loop_filterP2S_32x32
52
     bx          lr
53
 endfunc
54
 
55
@@ -462,7 +462,7 @@
56
     vmov.u16    q1, #8192
57
     vneg.s16    q1, q1
58
     mov         r12, #32
59
-.loop_filterP2S_32x64:
60
+.Loop_filterP2S_32x64:
61
     subs        r12, #1
62
 .rept 2
63
     vld1.u8     {q9-q10}, r0, r1
64
@@ -481,7 +481,7 @@
65
     vmla.s16    q3, q10, q0
66
     vst1.16     {q2-q3}, r2, r3
67
 .endr
68
-    bgt         .loop_filterP2S_32x64
69
+    bgt         .Loop_filterP2S_32x64
70
     bx          lr
71
 endfunc
72
 
73
@@ -493,7 +493,7 @@
74
     vmov.u16    q1, #8192
75
     vneg.s16    q1, q1
76
     mov         r12, #8
77
-.loop_filterP2S_64x16:
78
+.Loop_filterP2S_64x16:
79
     subs        r12, #1
80
 .rept 2
81
     vld1.u8     {q9-q10}, r0!
82
@@ -528,7 +528,7 @@
83
     vmla.s16    q3, q10, q0
84
     vst1.16     {q2-q3}, r2, r3
85
 .endr
86
-    bgt         .loop_filterP2S_64x16
87
+    bgt         .Loop_filterP2S_64x16
88
     bx          lr
89
 endfunc
90
 
91
@@ -540,7 +540,7 @@
92
     vmov.u16    q1, #8192
93
     vneg.s16    q1, q1
94
     mov         r12, #16
95
-.loop_filterP2S_64x32:
96
+.Loop_filterP2S_64x32:
97
     subs        r12, #1
98
 .rept 2
99
     vld1.u8     {q9-q10}, r0!
100
@@ -575,7 +575,7 @@
101
     vmla.s16    q3, q10, q0
102
     vst1.16     {q2-q3}, r2, r3
103
 .endr
104
-    bgt         .loop_filterP2S_64x32
105
+    bgt         .Loop_filterP2S_64x32
106
     bx          lr
107
 endfunc
108
 
109
@@ -587,7 +587,7 @@
110
     vmov.u16    q1, #8192
111
     vneg.s16    q1, q1
112
     mov         r12, #24
113
-.loop_filterP2S_64x48:
114
+.Loop_filterP2S_64x48:
115
     subs        r12, #1
116
 .rept 2
117
     vld1.u8     {q9-q10}, r0!
118
@@ -622,7 +622,7 @@
119
     vmla.s16    q3, q10, q0
120
     vst1.16     {q2-q3}, r2, r3
121
 .endr
122
-    bgt         .loop_filterP2S_64x48
123
+    bgt         .Loop_filterP2S_64x48
124
     bx          lr
125
 endfunc
126
 
127
@@ -634,7 +634,7 @@
128
     vmov.u16    q1, #8192
129
     vneg.s16    q1, q1
130
     mov         r12, #32
131
-.loop_filterP2S_64x64:
132
+.Loop_filterP2S_64x64:
133
     subs        r12, #1
134
 .rept 2
135
     vld1.u8     {q9-q10}, r0!
136
@@ -669,7 +669,7 @@
137
     vmla.s16    q3, q10, q0
138
     vst1.16     {q2-q3}, r2, r3
139
 .endr
140
-    bgt         .loop_filterP2S_64x64
141
+    bgt         .Loop_filterP2S_64x64
142
     bx          lr
143
 endfunc
144
 
145
@@ -681,7 +681,7 @@
146
     vmov.u16    q1, #8192
147
     vneg.s16    q1, q1
148
     mov         r12, #32
149
-.loop_filterP2S_48x64:
150
+.Loop_filterP2S_48x64:
151
     subs        r12, #1
152
 .rept 2
153
     vld1.u8     {q9-q10}, r0!
154
@@ -709,7 +709,7 @@
155
     vmla.s16    q3, q9, q0
156
     vst1.16     {q2-q3}, r2, r3
157
 .endr
158
-    bgt         .loop_filterP2S_48x64
159
+    bgt         .Loop_filterP2S_48x64
160
     bx          lr
161
 endfunc
162
 
163
@@ -756,7 +756,7 @@
164
     vmovl.u8    q2, d4
165
     vmovl.u8    q3, d6
166
 
167
-.loop_4x\h:
168
+.Loop_4x\h:
169
     // TODO: read extra 1 row for speed optimize, may made crash on OS X platform!
170
     vld1.u32    {d160}, r0, r1
171
     vld1.u32    {d161}, r0, r1
172
@@ -795,7 +795,7 @@
173
     vst1.u32    {d181}, r2, r3
174
 
175
     subs        r12, #2
176
-    bne        .loop_4x4
177
+    bne        .Loop_4x4
178
 
179
     pop         {pc}
180
     .ltorg
181
@@ -945,13 +945,13 @@
182
 
183
 .macro FILTER_VPP a b filterv
184
 
185
-.loop_\filterv\()_\a\()x\b:
186
+.Loop_\filterv\()_\a\()x\b:
187
 
188
     mov             r7, r2
189
     mov             r6, r0
190
     eor             r8, r8
191
 
192
-.loop_w8_\filterv\()_\a\()x\b:
193
+.Loop_w8_\filterv\()_\a\()x\b:
194
 
195
     add             r6, r0, r8
196
 
197
@@ -988,12 +988,12 @@
198
 
199
     add             r8, #8
200
     cmp             r8, #\a
201
x265_3.6.tar.gz/source/common/arm/mc-a.S -> x265_4.0.tar.gz/source/common/arm/mc-a.S Changed
37
 
1
@@ -554,7 +554,7 @@
2
     vsri.s16        q1, #1
3
     vneg.s16        q0, q0
4
     mov             r3, #4
5
-.loop_cpy2Dto1D_shr_16:
6
+.Loop_cpy2Dto1D_shr_16:
7
     subs            r3, #1
8
 .rept 4
9
     vld1.s16        {q2-q3}, r1, r2
10
@@ -564,7 +564,7 @@
11
     vshl.s16        q3, q0
12
     vst1.16         {q2-q3}, r0!
13
 .endr
14
-    bgt             .loop_cpy2Dto1D_shr_16
15
+    bgt             .Loop_cpy2Dto1D_shr_16
16
     bx              lr
17
 endfunc
18
 
19
@@ -577,7 +577,7 @@
20
     vsri.s16        q1, #1
21
     vneg.s16        q0, q0
22
     mov             r3, 16
23
-.loop_cpy2Dto1D_shr_32:
24
+.Loop_cpy2Dto1D_shr_32:
25
     subs            r3, #1
26
 .rept 2
27
     vld1.s16        {q2-q3}, r1!
28
@@ -593,7 +593,7 @@
29
     vst1.16         {q2-q3}, r0!
30
     vst1.16         {q8-q9}, r0!
31
 .endr
32
-    bgt             .loop_cpy2Dto1D_shr_32
33
+    bgt             .Loop_cpy2Dto1D_shr_32
34
     bx              lr
35
 endfunc
36
 
37
x265_3.6.tar.gz/source/common/arm/pixel-util.S -> x265_4.0.tar.gz/source/common/arm/pixel-util.S Changed
116
 
1
@@ -848,36 +848,36 @@
2
     vdup.8          q2, r12
3
     sub             r5, #1
4
 
5
-.loop_h:
6
+.Loop_h:
7
     mov             r6, r0
8
     mov             r12, r2
9
     eor             r7, r7
10
-.loop_w:
11
+.Loop_w:
12
     vld1.u8         {q0}, r6!
13
     vshl.u8         q0, q0, q2
14
     vst1.u8         {q0}, r12!
15
 
16
     add             r7, #16
17
     cmp             r7, r4
18
-    blt             .loop_w
19
+    blt             .Loop_w
20
 
21
     add             r0, r1
22
     add             r2, r3
23
 
24
     subs             r5, #1
25
-    bgt             .loop_h
26
+    bgt             .Loop_h
27
 
28
 // handle last row
29
     mov             r5, r4
30
     lsr             r5, #3
31
 
32
-.loopW8:
33
+.LoopW8:
34
     vld1.u8         d0, r0!
35
     vshl.u8         d0, d0, d4
36
     vst1.u8         d0, r2!
37
     subs            r4, r4, #8
38
     subs            r5, #1
39
-    bgt             .loopW8
40
+    bgt             .LoopW8
41
 
42
     mov             r5,#8
43
     sub             r5, r4
44
@@ -1970,7 +1970,7 @@
45
     eor             r5, r5
46
     veor.s32        q12, q12
47
 
48
-.loop_quant:
49
+.Loop_quant:
50
 
51
     vld1.s16        d16, r0!
52
     vmovl.s16       q9, d16                // q9= coefblockpos
53
@@ -1999,7 +1999,7 @@
54
     vst1.s16        d16, r3!
55
 
56
     subs            r4, #1
57
-    bne             .loop_quant
58
+    bne             .Loop_quant
59
 
60
     vadd.u32        d8, d9
61
     vpadd.u32       d8, d8
62
@@ -2023,7 +2023,7 @@
63
     eor             r4, r4
64
     veor.s32        q12, q12
65
 
66
-.loop_nquant:
67
+.Loop_nquant:
68
 
69
     vld1.s16        d16, r0!
70
     vmovl.s16       q9, d16                // q9= coefblockpos
71
@@ -2049,7 +2049,7 @@
72
     vst1.s16        d17, r2!
73
 
74
     subs            r3, #1
75
-    bne             .loop_nquant
76
+    bne             .Loop_nquant
77
 
78
     vadd.u32        d8, d9
79
     vpadd.u32       d8, d8
80
@@ -2148,7 +2148,7 @@
81
     mov             r10, #4
82
     eor             r9, r9
83
 
84
-.loop_32:
85
+.Loop_32:
86
 
87
     sa8d_16x16 r4
88
 
89
@@ -2166,7 +2166,7 @@
90
     sub             r2,  r2,  #24
91
 
92
     subs            r10, #1
93
-    bgt            .loop_32
94
+    bgt            .Loop_32
95
 
96
     mov             r0, r9
97
     vpop            {d8-d11}
98
@@ -2183,7 +2183,7 @@
99
     mov             r10, #4
100
     eor             r9, r9
101
 
102
-.loop_1:
103
+.Loop_1:
104
 
105
     sa8d_16x16 r4
106
 
107
@@ -2217,7 +2217,7 @@
108
     sub             r2,  r2,  #56
109
 
110
     subs            r10, #1
111
-    bgt            .loop_1
112
+    bgt            .Loop_1
113
 
114
     mov             r0, r9
115
     vpop            {d8-d11}
116
x265_3.6.tar.gz/source/common/arm/sad-a.S -> x265_4.0.tar.gz/source/common/arm/sad-a.S Changed
151
 
1
@@ -103,7 +103,7 @@
2
     vabal.u8        q9, d5, d7
3
     mov             r12, #(\h-2)/2
4
 
5
-.loop_16x\h:
6
+.Loop_16x\h:
7
 
8
     subs            r12, #1
9
     vld1.8          {q0}, r0, r1
10
@@ -115,7 +115,7 @@
11
     vabal.u8        q9, d1, d3
12
     vabal.u8        q8, d4, d6
13
     vabal.u8        q9, d5, d7
14
-    bne             .loop_16x\h
15
+    bne             .Loop_16x\h
16
 
17
     vadd.u16        q8, q8, q9
18
 .if \h == 64
19
@@ -147,7 +147,7 @@
20
     veor.u8         q11, q11
21
     mov             r12, #\h/8
22
 
23
-.loop_32x\h:
24
+.Loop_32x\h:
25
 
26
     subs            r12, #1
27
 .rept 4
28
@@ -166,7 +166,7 @@
29
     vabal.u8        q10, d26, d30
30
     vabal.u8        q11, d27, d31
31
 .endr
32
-    bne             .loop_32x\h
33
+    bne             .Loop_32x\h
34
 
35
     vadd.u16        q8, q8, q9
36
     vadd.u16        q10, q10, q11
37
@@ -213,7 +213,7 @@
38
     sub             r3, r12
39
     mov             r12, #\h/8
40
 
41
-.loop_64x\h:
42
+.Loop_64x\h:
43
 
44
     subs            r12, #1
45
 .rept 4
46
@@ -246,7 +246,7 @@
47
     vabal.u8        q10, d26, d30
48
     vabal.u8        q11, d27, d31
49
 .endr
50
-    bne             .loop_64x\h
51
+    bne             .Loop_64x\h
52
 
53
     vadd.u16        q8, q8, q9
54
     vadd.u16        q10, q10, q11
55
@@ -283,7 +283,7 @@
56
     sub             r3, #16
57
     mov             r12, #8
58
 
59
-.loop_24x32:
60
+.Loop_24x32:
61
 
62
     subs            r12, #1
63
 .rept 4
64
@@ -296,7 +296,7 @@
65
     vld1.8          {d1}, r2, r3
66
     vabal.u8        q10, d0, d1
67
 .endr
68
-    bne             .loop_24x32
69
+    bne             .Loop_24x32
70
 
71
     vadd.u16        q8, q8, q9
72
     vadd.u16        d16, d16, d17
73
@@ -322,7 +322,7 @@
74
     sub             r3, #32
75
     mov             r12, #16
76
 
77
-.loop_48x64:
78
+.Loop_48x64:
79
 
80
     subs            r12, #1
81
 .rept 4
82
@@ -337,7 +337,7 @@
83
     vabal.u8        q14, d4, d20
84
     vabal.u8        q15, d5, d21
85
 .endr
86
-    bne             .loop_48x64
87
+    bne             .Loop_48x64
88
 
89
     vadd.u16        q3, q3, q11
90
     vadd.u16        d6, d6, d7
91
@@ -635,12 +635,12 @@
92
     veor.u8         q15, q15
93
 .endif
94
 
95
-.loop_sad_x\x\()_16x\h:
96
+.Loop_sad_x\x\()_16x\h:
97
 .rept 8
98
     SAD_X_16 \x
99
 .endr
100
     subs            r6, #1
101
-    bne             .loop_sad_x\x\()_16x\h
102
+    bne             .Loop_sad_x\x\()_16x\h
103
 
104
     vadd.u16        q8, q8, q9
105
     vadd.u16        q10, q10, q11
106
@@ -929,12 +929,12 @@
107
     veor.u8         q14, q14
108
     veor.u8         q15, q15
109
 .endif
110
-.loop_sad_x\x\()_64x\h:
111
+.Loop_sad_x\x\()_64x\h:
112
 .rept 8
113
     SAD_X_64 \x
114
 .endr
115
     subs            r6, #1
116
-    bne             .loop_sad_x\x\()_64x\h
117
+    bne             .Loop_sad_x\x\()_64x\h
118
 
119
 .if \h <= 16
120
     vadd.u16        q8, q8, q9
121
@@ -1071,12 +1071,12 @@
122
     veor.u8         q15, q15
123
 .endif
124
 
125
-.loop_sad_x\x\()_48x64:
126
+.Loop_sad_x\x\()_48x64:
127
 .rept 8
128
     SAD_X_48 \x
129
 .endr
130
     subs            r6, #1
131
-    bne             .loop_sad_x\x\()_48x64
132
+    bne             .Loop_sad_x\x\()_48x64
133
 
134
     vpaddl.u16      q8, q8
135
     vpaddl.u16      q9, q9
136
@@ -1179,12 +1179,12 @@
137
     veor.u8         q15, q15
138
 .endif
139
 
140
-.loop_sad_x\x\()_24x32:
141
+.Loop_sad_x\x\()_24x32:
142
 .rept 8
143
     SAD_X_24 \x
144
 .endr
145
     subs            r6, #1
146
-    bne             .loop_sad_x\x\()_24x32
147
+    bne             .Loop_sad_x\x\()_24x32
148
 
149
     vadd.u16        q8, q8, q9
150
     vadd.u16        q10, q10, q11
151
x265_3.6.tar.gz/source/common/arm/ssd-a.S -> x265_4.0.tar.gz/source/common/arm/ssd-a.S Changed
127
 
1
@@ -121,7 +121,7 @@
2
     veor.u8     q0, q0
3
     veor.u8     q1, q1
4
 
5
-.loop_sse_pp_32:
6
+.Loop_sse_pp_32:
7
     subs        r12, #1
8
 .rept 4
9
     vld1.64     {q8-q9}, r0, r1
10
@@ -139,7 +139,7 @@
11
     vmlal.s16   q0, d26, d26
12
     vmlal.s16   q1, d27, d27
13
 .endr
14
-    bne         .loop_sse_pp_32
15
+    bne         .Loop_sse_pp_32
16
     vadd.s32    q0, q1
17
     vadd.s32    d0, d0, d1
18
     vpadd.s32   d0, d0, d0
19
@@ -154,7 +154,7 @@
20
     veor.u8     q0, q0
21
     veor.u8     q1, q1
22
 
23
-.loop_sse_pp_64:
24
+.Loop_sse_pp_64:
25
     subs        r12, #1
26
 .rept 4
27
     vld1.64     {q8-q9}, r0!
28
@@ -187,7 +187,7 @@
29
     vmlal.s16   q0, d26, d26
30
     vmlal.s16   q1, d27, d27
31
 .endr
32
-    bne         .loop_sse_pp_64
33
+    bne         .Loop_sse_pp_64
34
     vadd.s32    q0, q1
35
     vadd.s32    d0, d0, d1
36
     vpadd.s32   d0, d0, d0
37
@@ -257,7 +257,7 @@
38
     veor.u8     q0, q0
39
     veor.u8     q1, q1
40
 
41
-.loop_sse_ss_16:
42
+.Loop_sse_ss_16:
43
     subs        r12, #1
44
 .rept 4
45
     vld1.s16    {q8-q9}, r0, r1
46
@@ -269,7 +269,7 @@
47
     vmlal.s16   q0, d18, d18
48
     vmlal.s16   q1, d19, d19
49
 .endr
50
-    bne         .loop_sse_ss_16
51
+    bne         .Loop_sse_ss_16
52
     vadd.s32    q0, q1
53
     vadd.s32    d0, d0, d1
54
     vpadd.s32   d0, d0, d0
55
@@ -286,7 +286,7 @@
56
     veor.u8     q0, q0
57
     veor.u8     q1, q1
58
 
59
-.loop_sse_ss_32:
60
+.Loop_sse_ss_32:
61
     subs        r12, #1
62
 .rept 4
63
     vld1.s16    {q8-q9}, r0!
64
@@ -307,7 +307,7 @@
65
     vmlal.s16   q0, d18, d18
66
     vmlal.s16   q1, d19, d19
67
 .endr
68
-    bne         .loop_sse_ss_32
69
+    bne         .Loop_sse_ss_32
70
     vadd.s32    q0, q1
71
     vadd.s32    d0, d0, d1
72
     vpadd.s32   d0, d0, d0
73
@@ -324,7 +324,7 @@
74
     veor.u8     q0, q0
75
     veor.u8     q1, q1
76
 
77
-.loop_sse_ss_64:
78
+.Loop_sse_ss_64:
79
     subs        r12, #1
80
 .rept 2
81
     vld1.s16    {q8-q9}, r0!
82
@@ -363,7 +363,7 @@
83
     vmlal.s16   q0, d18, d18
84
     vmlal.s16   q1, d19, d19
85
 .endr
86
-    bne         .loop_sse_ss_64
87
+    bne         .Loop_sse_ss_64
88
     vadd.s32    q0, q1
89
     vadd.s32    d0, d0, d1
90
     vpadd.s32   d0, d0, d0
91
@@ -417,7 +417,7 @@
92
     veor.u8     q0, q0
93
     veor.u8     q1, q1
94
 
95
-.loop_ssd_s_16:
96
+.Loop_ssd_s_16:
97
     subs        r12, #1
98
 .rept 2
99
     vld1.s16    {q8-q9}, r0, r1
100
@@ -431,7 +431,7 @@
101
     vmlal.s16   q0, d22, d22
102
     vmlal.s16   q1, d23, d23
103
 .endr
104
-    bne         .loop_ssd_s_16
105
+    bne         .Loop_ssd_s_16
106
     vadd.s32    q0, q1
107
     vadd.s32    d0, d0, d1
108
     vpadd.s32   d0, d0, d0
109
@@ -446,7 +446,7 @@
110
     veor.u8     q0, q0
111
     veor.u8     q1, q1
112
 
113
-.loop_ssd_s_32:
114
+.Loop_ssd_s_32:
115
     subs        r12, #1
116
 .rept 4
117
     vld1.s16    {q8-q9}, r0!
118
@@ -460,7 +460,7 @@
119
     vmlal.s16   q0, d22, d22
120
     vmlal.s16   q1, d23, d23
121
 .endr
122
-    bne         .loop_ssd_s_32
123
+    bne         .Loop_ssd_s_32
124
     vadd.s32    q0, q1
125
     vadd.s32    d0, d0, d1
126
     vpadd.s32   d0, d0, d0
127
x265_3.6.tar.gz/source/common/common.h -> x265_4.0.tar.gz/source/common/common.h Changed
14
 
1
@@ -176,6 +176,12 @@
2
 template<typename T> /* clip to pixel range, 0..255 or 0..1023 */
3
 inline pixel x265_clip(T x) { return (pixel)x265_min<T>(T((1 << X265_DEPTH) - 1), x265_max<T>(T(0), x)); }
4
 
5
+/* get the sign of input variable */
6
+static inline int8_t x265_signOf(int32_t x)
7
+{
8
+    return (x >> 31) | ((int32_t)((((uint32_t) - x)) >> 31));
9
+}
10
+
11
 typedef int16_t  coeff_t;      // transform coefficient
12
 
13
 #define X265_MIN(a, b) ((a) < (b) ? (a) : (b))
14
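The new x265_signOf() helper is branchless: for positive x the arithmetic shift term is 0 and the unsigned-negation term contributes 1, for negative x the shift term is -1 (all bits set) and the OR keeps it, and for zero both terms are 0. A small self-check of that reasoning (illustrative only, not part of the sources):

    #include <assert.h>
    #include <stdint.h>

    static inline int8_t sign_ref(int32_t x) { return (int8_t)((x > 0) - (x < 0)); }

    void check_signOf()
    {
        const int32_t samples[] = { -7, -1, 0, 1, 42 };
        for (int32_t x : samples)
            assert(((x >> 31) | (int32_t)(((uint32_t)-x) >> 31)) == sign_ref(x));
    }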
x265_3.6.tar.gz/source/common/cpu.cpp -> x265_4.0.tar.gz/source/common/cpu.cpp Changed
45
 
1
@@ -115,6 +115,12 @@
2
 #if defined(HAVE_SVE2)
3
     { "SVE2",            X265_CPU_SVE2 },
4
 #endif
5
+#if defined(HAVE_NEON_DOTPROD)
6
+    { "Neon_DotProd",    X265_CPU_NEON_DOTPROD },
7
+#endif
8
+#if defined(HAVE_NEON_I8MM)
9
+    { "Neon_I8MM",       X265_CPU_NEON_I8MM },
10
+#endif
11
 #elif X265_ARCH_POWER8
12
     { "Altivec",         X265_CPU_ALTIVEC },
13
 
14
@@ -389,17 +395,22 @@
15
 {
16
     int flags = 0;
17
 
18
-    #if defined(HAVE_SVE2)
19
-         flags |= X265_CPU_SVE2;
20
-         flags |= X265_CPU_SVE;
21
+    #if HAVE_NEON
22
          flags |= X265_CPU_NEON;
23
-    #elif defined(HAVE_SVE)
24
+    #endif
25
+    #if HAVE_NEON_DOTPROD
26
+         flags |= X265_CPU_NEON_DOTPROD;
27
+    #endif
28
+    #if HAVE_NEON_I8MM
29
+         flags |= X265_CPU_NEON_I8MM;
30
+    #endif
31
+    #if HAVE_SVE
32
          flags |= X265_CPU_SVE;
33
-         flags |= X265_CPU_NEON;
34
-    #elif HAVE_NEON
35
-         flags |= X265_CPU_NEON;
36
     #endif
37
-        
38
+    #if HAVE_SVE2
39
+         flags |= X265_CPU_SVE2;
40
+    #endif
41
+
42
     return flags;
43
 }
44
 
45
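With this change each Arm capability bit is reported independently instead of being inferred from the highest extension that was compiled in, so Neon, DotProd, I8MM, SVE and SVE2 can all be present in the same mask. A hypothetical consumer of that mask (only the X265_CPU_* names come from the diff; the function itself is a sketch):

    #include "x265.h"   // defines the X265_CPU_* bit masks

    static bool canUseDotProdKernels(uint32_t cpuMask)
    {
        // DotProd kernels extend the baseline Neon paths, so both bits are expected.
        return (cpuMask & X265_CPU_NEON) && (cpuMask & X265_CPU_NEON_DOTPROD);
    }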
x265_3.6.tar.gz/source/common/cudata.cpp -> x265_4.0.tar.gz/source/common/cudata.cpp Changed
201
 
1
@@ -290,6 +290,10 @@
2
     m_bFirstRowInSlice = (uint8_t)firstRowInSlice;
3
     m_bLastRowInSlice  = (uint8_t)lastRowInSlice;
4
     m_bLastCuInSlice   = (uint8_t)lastCuInSlice;
5
+#if ENABLE_SCC_EXT
6
+    m_lastIntraBCMv[0].set(0, 0);
7
+    m_lastIntraBCMv[1].set(0, 0);
8
+#endif
9
 
10
     /* sequential memsets */
11
     m_partSet((uint8_t*)m_qp, (uint8_t)qp);
12
@@ -323,7 +327,7 @@
13
 }
14
 
15
 // initialize Sub partition
16
-void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp)
17
+void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp, MV lastIntraBCMv[2])
18
 {
19
     m_absIdxInCTU   = cuGeom.absPartIdx;
20
     m_encData       = ctu.m_encData;
21
@@ -360,6 +364,14 @@
22
     /* initialize the remaining CU data in one memset */
23
     memset(m_predMode, 0, (ctu.m_chromaFormat == X265_CSP_I400 ? BytesPerPartition - 13 : BytesPerPartition - 9) * m_numPartitions);
24
     memset(m_distortion, 0, m_numPartitions * sizeof(sse_t));
25
+
26
+#if ENABLE_SCC_EXT
27
+    if (lastIntraBCMv)
28
+    {
29
+        for (int i = 0; i < 2; i++)
30
+            m_lastIntraBCMv[i] = lastIntraBCMv[i];
31
+    }
32
+#endif
33
 }
34
 
35
 /* Copy the results of a sub-part (split) CU to the parent CU */
36
@@ -415,6 +427,10 @@
37
         memcpy(m_trCoeff1 + tmpC2, subCU.m_trCoeff1, sizeof(coeff_t) * tmpC);
38
         memcpy(m_trCoeff2 + tmpC2, subCU.m_trCoeff2, sizeof(coeff_t) * tmpC);
39
     }
40
+#if ENABLE_SCC_EXT
41
+    for (int i = 0; i < 2; i++)
42
+        m_lastIntraBCMv[i] = subCU.m_lastIntraBCMv[i];
43
+#endif
44
 }
45
 
46
 /* If a sub-CU part is not present (off the edge of the picture) its depth and
47
@@ -1591,7 +1607,11 @@
48
                 return maxNumMergeCand;
49
         }
50
     }
51
+#if ENABLE_SCC_EXT
52
+    if (m_slice->m_bTemporalMvp)
53
+#else
54
     if (m_slice->m_sps->bTemporalMVPEnabled)
55
+#endif
56
     {
57
         uint32_t partIdxRB = deriveRightBottomIdx(puIdx);
58
         MV colmv;
59
@@ -1681,10 +1701,15 @@
60
             }
61
         }
62
     }
63
-    int numRefIdx = (isInterB) ? X265_MIN(m_slice->m_numRefIdx[0], m_slice->m_numRefIdx[1]) : m_slice->m_numRefIdx[0];
64
+    int numRefIdx0 = m_slice->m_numRefIdx[0];
65
+#if ENABLE_SCC_EXT
66
+    if (m_slice->m_param->bEnableSCC)
67
+        numRefIdx0--;
68
+#endif
69
+    int numRefIdx = (isInterB) ? X265_MIN(numRefIdx0, m_slice->m_numRefIdx[1]) : numRefIdx0;
70
     int r = 0;
71
     int refcnt = 0;
72
-    while (count < maxNumMergeCand)
73
+    while (numRefIdx && (count < maxNumMergeCand))
74
     {
75
         candDir[count] = 1;
76
         candMvField[count][0].mv.word = 0;
77
@@ -1712,28 +1737,61 @@
78
 }
79
 
80
 // Create the PMV list. Called for each reference index.
81
-int CUData::getPMV(InterNeighbourMV *neighbours, uint32_t picList, uint32_t refIdx, MV* amvpCand, MV* pmv) const
82
+int CUData::getPMV(InterNeighbourMV* neighbours, uint32_t picList, uint32_t refIdx, MV* amvpCand, MV* pmv, uint32_t puIdx, uint32_t absPartIdx) const
83
 {
84
     MV directMV[MD_ABOVE_LEFT + 1];
85
     MV indirectMV[MD_ABOVE_LEFT + 1];
86
     bool validDirect[MD_ABOVE_LEFT + 1];
87
     bool validIndirect[MD_ABOVE_LEFT + 1];
88
 
89
-    // Left candidate.
90
-    validDirectMD_BELOW_LEFT  = getDirectPMV(directMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx);
91
-    validDirectMD_LEFT        = getDirectPMV(directMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx);
92
-    // Top candidate.
93
-    validDirectMD_ABOVE_RIGHT = getDirectPMV(directMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx);
94
-    validDirectMD_ABOVE       = getDirectPMV(directMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx);
95
-    validDirectMD_ABOVE_LEFT  = getDirectPMV(directMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx);
96
-
97
-    // Left candidate.
98
-    validIndirectMD_BELOW_LEFT  = getIndirectPMV(indirectMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx);
99
-    validIndirectMD_LEFT        = getIndirectPMV(indirectMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx);
100
-    // Top candidate.
101
-    validIndirectMD_ABOVE_RIGHT = getIndirectPMV(indirectMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx);
102
-    validIndirectMD_ABOVE       = getIndirectPMV(indirectMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx);
103
-    validIndirectMD_ABOVE_LEFT  = getIndirectPMV(indirectMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx);
104
+#if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT)
105
+    if (m_slice->m_param->numViews > 1 || m_slice->m_param->bEnableSCC)
106
+    {
107
+        // Left candidate.
108
+        if ((neighbours + MD_BELOW_LEFT)->isAvailable || (neighbours + MD_LEFT)->isAvailable)
109
+        {
110
+            validIndirectMD_ABOVE_RIGHT = validIndirectMD_ABOVE = validIndirectMD_ABOVE_LEFT = false;
111
+
112
+            validDirectMD_BELOW_LEFT = getDirectPMV(directMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx);
113
+            validDirectMD_LEFT = getDirectPMV(directMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx);
114
+
115
+            validIndirectMD_BELOW_LEFT = getIndirectPMV(indirectMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx);
116
+            validIndirectMD_LEFT = getIndirectPMV(indirectMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx);
117
+        }
118
+
119
+        // Top candidate.
120
+        validDirectMD_ABOVE_RIGHT = getDirectPMV(directMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx);
121
+        validDirectMD_ABOVE = getDirectPMV(directMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx);
122
+        validDirectMD_ABOVE_LEFT = getDirectPMV(directMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx);
123
+
124
+        // Top candidate.
125
+        if (!((neighbours + MD_BELOW_LEFT)->isAvailable || (neighbours + MD_LEFT)->isAvailable))
126
+        {
127
+            validDirectMD_BELOW_LEFT = validDirectMD_LEFT = validIndirectMD_BELOW_LEFT = validIndirectMD_LEFT = false;
128
+            validIndirectMD_ABOVE_RIGHT = getIndirectPMV(indirectMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx);
129
+            validIndirectMD_ABOVE = getIndirectPMV(indirectMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx);
130
+            validIndirectMD_ABOVE_LEFT = getIndirectPMV(indirectMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx);
131
+        }
132
+    }
133
+    else
134
+#endif
135
+    {
136
+        // Left candidate.
137
+        validDirectMD_BELOW_LEFT = getDirectPMV(directMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx);
138
+        validDirectMD_LEFT = getDirectPMV(directMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx);
139
+        // Top candidate.
140
+        validDirectMD_ABOVE_RIGHT = getDirectPMV(directMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx);
141
+        validDirectMD_ABOVE = getDirectPMV(directMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx);
142
+        validDirectMD_ABOVE_LEFT = getDirectPMV(directMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx);
143
+
144
+        // Left candidate.
145
+        validIndirectMD_BELOW_LEFT = getIndirectPMV(indirectMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx);
146
+        validIndirectMD_LEFT = getIndirectPMV(indirectMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx);
147
+        // Top candidate.
148
+        validIndirectMD_ABOVE_RIGHT = getIndirectPMV(indirectMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx);
149
+        validIndirectMD_ABOVE = getIndirectPMV(indirectMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx);
150
+        validIndirectMD_ABOVE_LEFT = getIndirectPMV(indirectMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx);
151
+    }
152
 
153
     int num = 0;
154
     // Left predictor search
155
@@ -1781,27 +1839,80 @@
156
 
157
     // Get the collocated candidate. At this step, either the first candidate
158
     // was found or its value is 0.
159
-    if (m_slice->m_sps->bTemporalMVPEnabled && num < 2)
160
+#if ENABLE_MULTIVIEW || ENABLE_SCC_EXT
161
+    if (m_slice->m_param->numViews > 1 || m_slice->m_param->bEnableSCC)
162
     {
163
-        int tempRefIdx = neighboursMD_COLLOCATED.refIdxpicList;
164
-        if (tempRefIdx != -1)
165
+        if (m_slice->m_bTemporalMvp && num < 2)
166
         {
167
-            uint32_t cuAddr = neighboursMD_COLLOCATED.cuAddrpicList;
168
-            const Frame* colPic = m_slice->m_refFrameListm_slice->isInterB() && !m_slice->m_colFromL0Flagm_slice->m_colRefIdx;
169
-            const CUData* colCU = colPic->m_encData->getPicCTU(cuAddr);
170
+            int refId = refIdx;
171
+            uint32_t absPartAddr = m_absIdxInCTU + absPartIdx;
172
+            uint32_t partIdxRB = deriveRightBottomIdx(puIdx);
173
+            bool isValid;
174
+
175
+            // co-located RightBottom temporal predictor (H)
176
+            int ctuIdx = -1;
177
 
178
-            // Scale the vector
179
-            int colRefPOC = colCU->m_slice->m_refPOCListtempRefIdx >> 4tempRefIdx & 0xf;
180
-            int colPOC = colCU->m_slice->m_poc;
181
+            // image boundary check
182
+            if (m_encData->getPicCTU(m_cuAddr)->m_cuPelX + g_zscanToPelXpartIdxRB + UNIT_SIZE < m_slice->m_sps->picWidthInLumaSamples &&
183
+                m_encData->getPicCTU(m_cuAddr)->m_cuPelY + g_zscanToPelYpartIdxRB + UNIT_SIZE < m_slice->m_sps->picHeightInLumaSamples)
184
+            {
185
+                uint32_t absPartIdxRB = g_zscanToRasterpartIdxRB;
186
+                uint32_t numUnits = s_numPartInCUSize;
187
+                bool bNotLastCol = lessThanCol(absPartIdxRB, numUnits - 1); // is not at the last column of CTU
188
+                bool bNotLastRow = lessThanRow(absPartIdxRB, numUnits - 1); // is not at the last row    of CTU
189
 
190
-            int curRefPOC = m_slice->m_refPOCListpicListrefIdx;
191
-            int curPOC = m_slice->m_poc;
192
-            pmvnumMvc++ = amvpCandnum++ = scaleMvByPOCDist(neighboursMD_COLLOCATED.mvpicList, curPOC, curRefPOC, colPOC, colRefPOC);
193
+                if (bNotLastCol && bNotLastRow)
194
+                {
195
+                    absPartAddr = g_rasterToZscanabsPartIdxRB + RASTER_SIZE + 1;
196
+                    ctuIdx = m_cuAddr;
197
+                }
198
+                else if (bNotLastCol)
199
+                    absPartAddr = g_rasterToZscan(absPartIdxRB + 1) & (numUnits - 1);
200
+                else if (bNotLastRow)
201
x265_3.6.tar.gz/source/common/cudata.h -> x265_4.0.tar.gz/source/common/cudata.h Changed
79
 
1
@@ -37,6 +37,9 @@
2
 class Slice;
3
 struct TUEntropyCodingParameters;
4
 struct CUDataMemPool;
5
+#if ENABLE_SCC_EXT
6
+struct IBC;
7
+#endif
8
 
9
 enum PartSize
10
 {
11
@@ -107,6 +110,8 @@
12
     // Collocated right bottom CU addr.
13
     uint32_t cuAddr[2];
14
 
15
+    bool isAvailable;
16
+
17
     // For spatial prediction, this field contains the reference index
18
     // in each list (-1 if not available).
19
     //
20
@@ -118,6 +123,14 @@
21
     union { int16_t refIdx[2]; int32_t unifiedRef; };
22
 };
23
 
24
+struct IBC
25
+{
26
+    int             m_numBVs;
27
+    int             m_numBV16s;
28
+    MV              m_BVs[64];
29
+    MV              m_lastIntraBCMv[2];
30
+};
31
+
32
 typedef void(*cucopy_t)(uint8_t* dst, uint8_t* src); // dst and src are aligned to MIN(size, 32)
33
 typedef void(*cubcast_t)(uint8_t* dst, uint8_t val); // dst is aligned to MIN(size, 32)
34
 
35
@@ -230,13 +243,17 @@
36
     uint32_t*       m_collectCUVariance;
37
     uint32_t*       m_collectCUCount;
38
 
39
+#if ENABLE_SCC_EXT
40
+    MV              m_lastIntraBCMv[2];
41
+#endif
42
+
43
     CUData();
44
 
45
     void     initialize(const CUDataMemPool& dataPool, uint32_t depth, const x265_param& param, int instance);
46
     static void calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, uint32_t minCUSize, CUGeom cuDataArrayCUGeom::MAX_GEOMS);
47
 
48
     void     initCTU(const Frame& frame, uint32_t cuAddr, int qp, uint32_t firstRowInSlice, uint32_t lastRowInSlice, uint32_t lastCUInSlice);
49
-    void     initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp);
50
+    void     initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp, MV lastIntraBCMv2 = 0);
51
     void     initLosslessCU(const CUData& cu, const CUGeom& cuGeom);
52
 
53
     void     copyPartFrom(const CUData& cu, const CUGeom& childGeom, uint32_t subPartIdx);
54
@@ -272,7 +289,7 @@
55
     int8_t   getRefQP(uint32_t currAbsIdxInCTU) const;
56
     uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*candMvField)2, uint8_t* candDir) const;
57
     void     clipMv(MV& outMV) const;
58
-    int      getPMV(InterNeighbourMV *neighbours, uint32_t reference_list, uint32_t refIdx, MV* amvpCand, MV* pmv) const;
59
+    int      getPMV(InterNeighbourMV* neighbours, uint32_t reference_list, uint32_t refIdx, MV* amvpCand, MV* pmv, uint32_t puIdx = 0, uint32_t absPartIdx = 0) const;
60
     void     getNeighbourMV(uint32_t puIdx, uint32_t absPartIdx, InterNeighbourMV* neighbours) const;
61
     void     getIntraTUQtDepthRange(uint32_t tuDepthRange2, uint32_t absPartIdx) const;
62
     void     getInterTUQtDepthRange(uint32_t tuDepthRange2, uint32_t absPartIdx) const;
63
@@ -309,6 +326,15 @@
64
     const CUData* getPUAboveRightAdi(uint32_t& arPartUnitIdx, uint32_t curPartUnitIdx, uint32_t partUnitOffset) const;
65
     const CUData* getPUBelowLeftAdi(uint32_t& blPartUnitIdx, uint32_t curPartUnitIdx, uint32_t partUnitOffset) const;
66
 
67
+#if ENABLE_SCC_EXT
68
+    void getIntraBCMVPsEncOnly(uint32_t absPartIdx, MV* MvPred, int& nbPred, int puIdx);
69
+    bool getDerivedBV(uint32_t absPartIdx, const MV& currentMv, MV& derivedMv, uint32_t width, uint32_t height);
70
+    bool isIntraBC(const CUData* cu, uint32_t absPartIdx) const;
71
+    bool getColMVPIBC(int ctuRsAddr, int partUnitIdx, MV& rcMv);
72
+    void roundMergeCandidates(MVField(*candMvField)2, int iCount) const;
73
+    bool is8x8BipredRestriction(MV mvL0, MV mvL1, int iRefIdxL0, int iRefIdxL1) const;
74
+#endif
75
+
76
 protected:
77
 
78
     template<typename T>
79
x265_3.6.tar.gz/source/common/dct.cpp -> x265_4.0.tar.gz/source/common/dct.cpp Changed
100
 
1
@@ -439,7 +439,8 @@
2
     }
3
 }
4
 
5
-static void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
6
+namespace X265_NS {
7
+void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
8
 {
9
     const int shift_1st = 1 + X265_DEPTH - 8;
10
     const int shift_2nd = 8;
11
@@ -456,7 +457,7 @@
12
     fastForwardDst(coef, dst, shift_2nd);
13
 }
14
 
15
-static void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
16
+void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
17
 {
18
     const int shift_1st = 1 + X265_DEPTH - 8;
19
     const int shift_2nd = 8;
20
@@ -473,7 +474,7 @@
21
     partialButterfly4(coef, dst, shift_2nd, 4);
22
 }
23
 
24
-static void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
25
+void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
26
 {
27
     const int shift_1st = 2 + X265_DEPTH - 8;
28
     const int shift_2nd = 9;
29
@@ -490,7 +491,7 @@
30
     partialButterfly8(coef, dst, shift_2nd, 8);
31
 }
32
 
33
-static void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
34
+void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
35
 {
36
     const int shift_1st = 3 + X265_DEPTH - 8;
37
     const int shift_2nd = 10;
38
@@ -507,7 +508,7 @@
39
     partialButterfly16(coef, dst, shift_2nd, 16);
40
 }
41
 
42
-static void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
43
+void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
44
 {
45
     const int shift_1st = 4 + X265_DEPTH - 8;
46
     const int shift_2nd = 11;
47
@@ -524,7 +525,7 @@
48
     partialButterfly32(coef, dst, shift_2nd, 32);
49
 }
50
 
51
-static void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
52
+void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
53
 {
54
     const int shift_1st = 7;
55
     const int shift_2nd = 12 - (X265_DEPTH - 8);
56
@@ -541,7 +542,7 @@
57
     }
58
 }
59
 
60
-static void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
61
+void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
62
 {
63
     const int shift_1st = 7;
64
     const int shift_2nd = 12 - (X265_DEPTH - 8);
65
@@ -558,7 +559,7 @@
66
     }
67
 }
68
 
69
-static void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
70
+void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
71
 {
72
     const int shift_1st = 7;
73
     const int shift_2nd = 12 - (X265_DEPTH - 8);
74
@@ -575,7 +576,7 @@
75
     }
76
 }
77
 
78
-static void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
79
+void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
80
 {
81
     const int shift_1st = 7;
82
     const int shift_2nd = 12 - (X265_DEPTH - 8);
83
@@ -592,7 +593,7 @@
84
     }
85
 }
86
 
87
-static void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
88
+void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
89
 {
90
     const int shift_1st = 7;
91
     const int shift_2nd = 12 - (X265_DEPTH - 8);
92
@@ -608,6 +609,7 @@
93
         memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
94
     }
95
 }
96
+} // namespace X265_NS
97
 
98
 static void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
99
 {
100
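Dropping the static qualifiers and wrapping the C transforms in the X265_NS namespace makes them reachable from other translation units, which the Arm primitive-setup code can then declare and fall back to. A sketch of such a declaration (the exact header it lives in is an assumption; the signatures are taken from the diff):

    namespace X265_NS {
    void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride);
    void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride);
    }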
x265_3.6.tar.gz/source/common/deblock.cpp -> x265_4.0.tar.gz/source/common/deblock.cpp Changed
19
 
1
@@ -316,7 +316,7 @@
2
 
3
 void Deblock::edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength)
4
 {
5
-    PicYuv* reconPic = cuQ->m_encData->m_reconPic;
6
+    PicYuv* reconPic = cuQ->m_encData->m_reconPic[0];
7
     pixel* src = reconPic->getLumaAddr(cuQ->m_cuAddr, absPartIdx);
8
     intptr_t stride = reconPic->m_stride;
9
     const PPS* pps = cuQ->m_slice->m_pps;
10
@@ -429,7 +429,7 @@
11
                 : ((g_zscanToPelY[absPartIdx] + edge * UNIT_SIZE) >> cuQ->m_vChromaShift)) % DEBLOCK_SMALLEST_BLOCK == 0,
12
                "invalid edge\n");
13
 
14
-    PicYuv* reconPic = cuQ->m_encData->m_reconPic;
15
+    PicYuv* reconPic = cuQ->m_encData->m_reconPic[0];
16
     intptr_t stride = reconPic->m_strideC;
17
     intptr_t srcOffset = reconPic->getChromaAddrOffset(cuQ->m_cuAddr, absPartIdx);
18
     bool bCheckNoFilter = pps->bTransquantBypassEnabled;
19
x265_3.6.tar.gz/source/common/frame.cpp -> x265_4.0.tar.gz/source/common/frame.cpp Changed
147
 
1
@@ -37,7 +37,8 @@
2
     m_reconColCount = NULL;
3
     m_countRefEncoders = 0;
4
     m_encData = NULL;
5
-    m_reconPic = NULL;
6
+    for (int i = 0; i < NUM_RECON_VERSION; i++)
7
+        m_reconPic[i] = NULL;
8
     m_quantOffsets = NULL;
9
     m_next = NULL;
10
     m_prev = NULL;
11
@@ -75,6 +76,11 @@
12
 
13
     m_tempLayer = 0;
14
     m_sameLayerRefPic = false;
15
+
16
+    m_viewId = 0;
17
+    m_valid = 0;
18
+    m_nextSubDPB = NULL;
19
+    m_prevSubDPB = NULL;
20
 }
21
 
22
 bool Frame::create(x265_param *param, float* quantOffsets)
23
@@ -85,6 +91,7 @@
24
     if (m_param->bEnableTemporalFilter)
25
     {
26
         m_mcstf = new TemporalFilter;
27
+        m_mcstf->m_range = param->mcstfFrameRange;
28
         m_mcstf->init(param);
29
 
30
         m_fencPicSubsampled2 = new PicYuv;
31
@@ -198,29 +205,35 @@
32
 bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
33
 {
34
     m_encData = new FrameData;
35
-    m_reconPic = new PicYuv;
36
     m_param = param;
37
-    m_encData->m_reconPic = m_reconPic;
38
-    bool ok = m_encData->create(*param, sps, m_fencPic->m_picCsp) && m_reconPic->create(param);
39
+    for (int i = 0; i < !!m_param->bEnableSCC + 1; i++)
40
+    {
41
+        m_reconPic[i] = new PicYuv;
42
+        m_encData->m_reconPic[i] = m_reconPic[i];
43
+    }
44
+    bool ok = m_encData->create(*param, sps, m_fencPic->m_picCsp) && m_reconPic[0]->create(param) && (param->bEnableSCC ? (param->bEnableSCC && m_reconPic[1]->create(param)) : 1);
45
     if (ok)
46
     {
47
-        /* initialize right border of m_reconpicYuv as SAO may read beyond the
48
+        /* initialize right border of m_reconPicYuv as SAO may read beyond the
49
          * end of the picture accessing uninitialized pixels */
50
         int maxHeight = sps.numCuInHeight * param->maxCUSize;
51
-        memset(m_reconPic->m_picOrg[0], 0, sizeof(pixel)* m_reconPic->m_stride * maxHeight);
52
+        memset(m_reconPic[0]->m_picOrg[0], 0, sizeof(pixel)* m_reconPic[0]->m_stride * maxHeight);
53
 
54
-        /* use pre-calculated cu/pu offsets cached in the SPS structure */
55
-        m_reconPic->m_cuOffsetY = sps.cuOffsetY;
56
-        m_reconPic->m_buOffsetY = sps.buOffsetY;
57
-
58
-        if (param->internalCsp != X265_CSP_I400)
59
+        for (int i = 0; i < !!m_param->bEnableSCC + 1; i++)
60
         {
61
-            memset(m_reconPic->m_picOrg[1], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
62
-            memset(m_reconPic->m_picOrg[2], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
63
-
64
             /* use pre-calculated cu/pu offsets cached in the SPS structure */
65
-            m_reconPic->m_cuOffsetC = sps.cuOffsetC;
66
-            m_reconPic->m_buOffsetC = sps.buOffsetC;
67
+            m_reconPic[i]->m_cuOffsetY = sps.cuOffsetY;
68
+            m_reconPic[i]->m_buOffsetY = sps.buOffsetY;
69
+
70
+            if (param->internalCsp != X265_CSP_I400)
71
+            {
72
+                memset(m_reconPic[i]->m_picOrg[1], 0, sizeof(pixel) * m_reconPic[i]->m_strideC * (maxHeight >> m_reconPic[i]->m_vChromaShift));
73
+                memset(m_reconPic[i]->m_picOrg[2], 0, sizeof(pixel) * m_reconPic[i]->m_strideC * (maxHeight >> m_reconPic[i]->m_vChromaShift));
74
+
75
+                /* use pre-calculated cu/pu offsets cached in the SPS structure */
76
+                m_reconPic[i]->m_cuOffsetC = sps.cuOffsetC;
77
+                m_reconPic[i]->m_buOffsetC = sps.buOffsetC;
78
+            }
79
         }
80
     }
81
     return ok;
82
@@ -230,7 +243,8 @@
83
 void Frame::reinit(const SPS& sps)
84
 {
85
     m_bChromaExtended = false;
86
-    m_reconPic = m_encData->m_reconPic;
87
+    for (int i = 0; i < !!m_param->bEnableSCC + 1; i++)
88
+        m_reconPic[i] = m_encData->m_reconPic[i];
89
     m_encData->reinit(sps);
90
 }
91
 
92
@@ -243,6 +257,35 @@
93
         m_encData = NULL;
94
     }
95
 
96
+#if ENABLE_MULTIVIEW
97
+    //Destroy interlayer References
98
+    if (refPicSetInterLayer0.size())
99
+    {
100
+        Frame* iterFrame = refPicSetInterLayer0.first();
101
+
102
+        while (iterFrame)
103
+        {
104
+            Frame* curFrame = iterFrame;
105
+            iterFrame = iterFrame->m_nextSubDPB;
106
+            refPicSetInterLayer0.removeSubDPB(*curFrame);
107
+            iterFrame = refPicSetInterLayer0.first();
108
+        }
109
+    }
110
+
111
+    if (refPicSetInterLayer1.size())
112
+    {
113
+        Frame* iterFrame = refPicSetInterLayer1.first();
114
+
115
+        while (iterFrame)
116
+        {
117
+            Frame* curFrame = iterFrame;
118
+            iterFrame = iterFrame->m_nextSubDPB;
119
+            refPicSetInterLayer1.removeSubDPB(*curFrame);
120
+            iterFrame = refPicSetInterLayer1.first();
121
+        }
122
+    }
123
+#endif
124
+
125
     if (m_fencPic)
126
     {
127
         if (m_param->bCopyPicToFrame)
128
@@ -271,11 +314,14 @@
129
         X265_FREE(m_isSubSampled);
130
     }
131
 
132
-    if (m_reconPic)
133
+    for (int i = 0; i < !!m_param->bEnableSCC + 1; i++)
134
     {
135
-        m_reconPic->destroy();
136
-        delete m_reconPic;
137
-        m_reconPic = NULL;
138
+        if (m_reconPic[i])
139
+        {
140
+            m_reconPic[i]->destroy();
141
+            delete m_reconPic[i];
142
+            m_reconPic[i] = NULL;
143
+        }
144
     }
145
 
146
     if (m_reconRowFlag)
147
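Editor's note: Frame now carries an array of reconstructed pictures (m_reconPic[NUM_RECON_VERSION]); the second slot is only allocated when SCC is enabled. A one-line sketch of the loop bound used throughout this file (assumption: bEnableSCC is the x265_param flag added by this release):

    // !!bEnableSCC + 1 evaluates to 1 for normal encodes and 2 when --scc is on,
    // so every allocate/reinit/destroy loop walks exactly the versions that exist.
    inline int reconVersions(int bEnableSCC) { return !!bEnableSCC + 1; }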
x265_3.6.tar.gz/source/common/frame.h -> x265_4.0.tar.gz/source/common/frame.h Changed
33
 
1
@@ -81,13 +81,16 @@
2
     /* These two items will be NULL until the Frame begins to be encoded, at which point
3
      * it will be assigned a FrameData instance, which comes with a reconstructed image PicYuv */
4
     FrameData*             m_encData;
5
-    PicYuv*                m_reconPic;
6
+    PicYuv*                m_reconPic[NUM_RECON_VERSION];
7
 
8
     /* Data associated with x265_picture */
9
     PicYuv*                m_fencPic;
10
     PicYuv*                m_fencPicSubsampled2;
11
     PicYuv*                m_fencPicSubsampled4;
12
 
13
+    PicList                refPicSetInterLayer0;
14
+    PicList                refPicSetInterLayer1;
15
+
16
     int                    m_poc;
17
     int                    m_encodeOrder;
18
     int                    m_gopOffset;
19
@@ -161,6 +164,13 @@
20
     int8_t                 m_gopId;
21
     bool                   m_sameLayerRefPic;
22
 
23
+    int                    m_sLayerId;
24
+    bool                   m_valid;
25
+
26
+    int                    m_viewId;
27
+    Frame*                 m_nextSubDPB;           // PicList doubly linked list pointers
28
+    Frame*                 m_prevSubDPB;
29
+
30
     Frame();
31
 
32
     bool create(x265_param *param, float* quantOffsets);
33
x265_3.6.tar.gz/source/common/framedata.h -> x265_4.0.tar.gz/source/common/framedata.h Changed
10
 
1
@@ -115,7 +115,7 @@
2
     const x265_param* m_param;
3
 
4
     FrameData*     m_freeListNext;
5
-    PicYuv*        m_reconPic;
6
+    PicYuv*        m_reconPic[NUM_RECON_VERSION];
7
     bool           m_bHasReferences;   /* used during DPB/RPS updates */
8
     int            m_frameEncoderID;   /* the ID of the FrameEncoder encoding this frame */
9
     JobProvider*   m_jobProvider;
10
x265_3.6.tar.gz/source/common/ipfilter.cpp -> x265_4.0.tar.gz/source/common/ipfilter.cpp Changed
23
 
1
@@ -34,8 +34,8 @@
2
 #pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions
3
 #endif
4
 
5
-namespace {
6
-// file local namespace
7
+namespace X265_NS {
8
+// x265 private namespace
9
 
10
 template<int width, int height>
11
 void filterPixelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
12
@@ -367,10 +367,6 @@
13
     interp_horiz_ps_c<N, width, height>(src, srcStride, immed, width, idxX, 1);
14
     filterVertical_sp_c<N>(immed + (N / 2 - 1) * width, width, dst, dstStride, width, height, idxY);
15
 }
16
-}
17
-
18
-namespace X265_NS {
19
-// x265 private namespace
20
 
21
 #define CHROMA_420(W, H) \
22
     p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
23
x265_3.6.tar.gz/source/common/loopfilter.cpp -> x265_4.0.tar.gz/source/common/loopfilter.cpp Changed
55
 
1
@@ -30,16 +30,10 @@
2
 
3
 namespace {
4
 
5
-/* get the sign of input variable (TODO: this is a dup, make common) */
6
-inline int8_t signOf(int x)
7
-{
8
-    return (x >> 31) | ((int)((((uint32_t)-x)) >> 31));
9
-}
10
-
11
 static void calSign(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
12
 {
13
     for (int x = 0; x < endX; x++)
14
-        dst[x] = signOf(src1[x] - src2[x]);
15
+        dst[x] = x265_signOf(src1[x] - src2[x]);
16
 }
17
 
18
 static void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t* signLeft, intptr_t stride)
19
@@ -70,7 +64,7 @@
20
 
21
     for (x = 0; x < width; x++)
22
     {
23
-        signDown = signOf(rec[x] - rec[x + stride]);
24
+        signDown = x265_signOf(rec[x] - rec[x + stride]);
25
         edgeType = signDown + upBuff1[x] + 2;
26
         upBuff1[x] = -signDown;
27
         rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
28
@@ -87,7 +81,7 @@
29
     {
30
         for (x = 0; x < width; x++)
31
         {
32
-            signDown = signOf(rec[x] - rec[x + stride]);
33
+            signDown = x265_signOf(rec[x] - rec[x + stride]);
34
             edgeType = signDown + upBuff1[x] + 2;
35
             upBuff1[x] = -signDown;
36
             rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
37
@@ -101,7 +95,7 @@
38
     int x;
39
     for (x = 0; x < width; x++)
40
     {
41
-        int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
42
+        int8_t signDown = x265_signOf(rec[x] - rec[x + stride + 1]);
43
         int edgeType = signDown + buff1[x] + 2;
44
         bufft[x + 1] = -signDown;
45
         rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);;
46
@@ -115,7 +109,7 @@
47
 
48
     for (int x = startX + 1; x < endX; x++)
49
     {
50
-        signDown = signOf(rec[x] - rec[x + stride]);
51
+        signDown = x265_signOf(rec[x] - rec[x + stride]);
52
         edgeType = signDown + upBuff1[x] + 2;
53
         upBuff1[x - 1] = -signDown;
54
         rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
55
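Editor's note: the SAO helpers switch from a file-local signOf() to the shared x265_signOf(). A sketch of the branchless form being shared (it mirrors the deleted local helper shown above; the canonical definition lives in the common headers):

    // Returns -1, 0 or +1 without branches: the arithmetic shift produces the
    // "negative" mask, the unsigned negate-and-shift produces the "positive" bit.
    inline int8_t signOfSketch(int x)
    {
        return (int8_t)((x >> 31) | ((int)(((uint32_t)-x) >> 31)));
    }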
x265_3.6.tar.gz/source/common/lowpassdct.cpp -> x265_4.0.tar.gz/source/common/lowpassdct.cpp Changed
28
 
1
@@ -58,7 +58,7 @@
2
     }
3
 
4
     // replace first coef with total block average
5
-    dst[0] = totalSum << 1;
6
+    dst[0] = (X265_DEPTH == 8) ? (totalSum << 1) : (totalSum >> ((X265_DEPTH - 9)));
7
 }
8
 
9
 static void lowPassDct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
10
@@ -83,7 +83,7 @@
11
     {
12
         memcpy(&dst[i * 16], &coef[i * 8], 8 * sizeof(int16_t));
13
     }
14
-    dst[0] = static_cast<int16_t>(totalSum >> 1);
15
+    dst[0] = static_cast<int16_t>(totalSum >> (1 + (X265_DEPTH - 8)));
16
 }
17
 
18
 static void lowPassDct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
19
@@ -108,7 +108,7 @@
20
     {
21
         memcpy(&dst[i * 32], &coef[i * 16], 16 * sizeof(int16_t));
22
     }
23
-    dst[0] = static_cast<int16_t>(totalSum >> 3);
24
+    dst[0] = static_cast<int16_t>(totalSum >> (3 + (X265_DEPTH - 8)));
25
 }
26
 
27
 namespace X265_NS {
28
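Editor's note: the lowpass DCT fix scales the DC substitute by the extra transform gain at high bit depth. A worked check of the new shift (assumption: totalSum is the block sum computed just above each assignment):

    // lowPassDct16: DC = totalSum >> (1 + (X265_DEPTH - 8))
    //   8-bit build  -> totalSum >> 1
    //   10-bit build -> totalSum >> 3   (two extra bits of forward-transform gain)
    constexpr int dc16Shift(int bitDepth) { return 1 + (bitDepth - 8); }
    static_assert(dc16Shift(8) == 1 && dc16Shift(10) == 3, "DC shift per bit depth");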
x265_3.6.tar.gz/source/common/param.cpp -> x265_4.0.tar.gz/source/common/param.cpp Changed
201
 
1
@@ -183,6 +183,7 @@
2
     param->bEnableSceneCutAwareQp = 0;
3
     param->fwdMaxScenecutWindow = 1200;
4
     param->bwdMaxScenecutWindow = 600;
5
+    param->mcstfFrameRange = 2;
6
     for (int i = 0; i < 6; i++)
7
     {
8
         int deltas[6] = { 5, 4, 3, 2, 1, 0 };
9
@@ -391,6 +392,10 @@
10
     param->bEnableTemporalFilter = 0;
11
     param->temporalFilterStrength = 0.95;
12
 
13
+    /*Alpha Channel Encoding*/
14
+    param->bEnableAlpha = 0;
15
+    param->numScalableLayers = 1;
16
+
17
 #ifdef SVT_HEVC
18
     param->svtHevcParam = svtParam;
19
     svt_param_default(param);
20
@@ -398,6 +403,15 @@
21
     /* Film grain characteristics model filename */
22
     param->filmGrain = NULL;
23
     param->bEnableSBRC = 0;
24
+
25
+    /* Multi-View Encoding*/
26
+    param->numViews = 1;
27
+    param->format = 0;
28
+
29
+    param->numLayers = 1;
30
+
31
+    /* SCC */
32
+    param->bEnableSCC = 0;
33
 }
34
 
35
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
36
@@ -417,6 +431,7 @@
37
 
38
         if (!strcmp(preset, "ultrafast"))
39
         {
40
+            param->mcstfFrameRange = 1;
41
             param->maxNumMergeCand = 2;
42
             param->bIntraInBFrames = 0;
43
             param->lookaheadDepth = 5;
44
@@ -441,6 +456,7 @@
45
         }
46
         else if (!strcmp(preset, "superfast"))
47
         {
48
+            param->mcstfFrameRange = 1;
49
             param->maxNumMergeCand = 2;
50
             param->bIntraInBFrames = 0;
51
             param->lookaheadDepth = 10;
52
@@ -461,6 +477,7 @@
53
         }
54
         else if (!strcmp(preset, "veryfast"))
55
         {
56
+            param->mcstfFrameRange = 1;
57
             param->maxNumMergeCand = 2;
58
             param->limitReferences = 3;
59
             param->bIntraInBFrames = 0;
60
@@ -474,6 +491,7 @@
61
         }
62
         else if (!strcmp(preset, "faster"))
63
         {
64
+            param->mcstfFrameRange = 1;
65
             param->maxNumMergeCand = 2;
66
             param->limitReferences = 3;
67
             param->bIntraInBFrames = 0;
68
@@ -485,6 +503,7 @@
69
         }
70
         else if (!strcmp(preset, "fast"))
71
         {
72
+            param->mcstfFrameRange = 1;
73
             param->maxNumMergeCand = 2;
74
             param->limitReferences = 3;
75
             param->bEnableEarlySkip = 0;
76
@@ -497,6 +516,7 @@
77
         }
78
         else if (!strcmp(preset, "medium"))
79
         {
80
+            param->mcstfFrameRange = 1;
81
             /* defaults */
82
         }
83
         else if (!strcmp(preset, "slow"))
84
@@ -1437,6 +1457,33 @@
85
         OPT("film-grain") p->filmGrain = (char* )value;
86
         OPT("mcstf") p->bEnableTemporalFilter = atobool(value);
87
         OPT("sbrc") p->bEnableSBRC = atobool(value);
88
+#if ENABLE_ALPHA
89
+        OPT("alpha")
90
+        {
91
+            if (atobool(value))
92
+            {
93
+                p->bEnableAlpha = 1;
94
+                p->numScalableLayers = 2;
95
+                p->numLayers = 2;
96
+            }
97
+        }
98
+#endif
99
+#if ENABLE_MULTIVIEW
100
+        OPT("format")
101
+            p->format = atoi(value);
102
+        OPT("num-views")
103
+        {
104
+            p->numViews = atoi(value);
105
+        }
106
+#endif
107
+#if ENABLE_SCC_EXT
108
+        OPT("scc")
109
+        {
110
+            p->bEnableSCC = atoi(value);
111
+            if (p->bEnableSCC)
112
+                p->bEnableWeightedPred = false;
113
+        }
114
+#endif
115
         else
116
             return X265_PARAM_BAD_NAME;
117
     }
118
@@ -1674,7 +1721,7 @@
119
         CHECK(param->edgeVarThreshold < 0.0f || param->edgeVarThreshold > 1.0f,
120
               "Minimum edge density percentage for a CU should be an integer between 0 to 100");
121
     }
122
-    CHECK(param->bframes && param->bframes >= param->lookaheadDepth && !param->rc.bStatRead,
123
+    CHECK(param->bframes && (param->bEnableTemporalFilter ? (param->bframes > param->lookaheadDepth) : (param->bframes >= param->lookaheadDepth)) && !param->rc.bStatRead,
124
           "Lookahead depth must be greater than the max consecutive bframe count");
125
     CHECK(param->bframes < 0,
126
           "bframe count should be greater than zero");
127
@@ -1908,6 +1955,21 @@
128
         }
129
     }
130
     CHECK(param->rc.dataShareMode != X265_SHARE_MODE_FILE && param->rc.dataShareMode != X265_SHARE_MODE_SHAREDMEM, "Invalid data share mode. It must be one of the X265_DATA_SHARE_MODES enum values\n" );
131
+#if ENABLE_ALPHA
132
+    if (param->bEnableAlpha)
133
+    {
134
+        CHECK((param->internalCsp != X265_CSP_I420), "Alpha encode supported only with i420a colorspace");
135
+        CHECK((param->analysisMultiPassDistortion || param->analysisMultiPassRefine), "Alpha encode doesnot support multipass feature");
136
+    }
137
+#endif
138
+#if ENABLE_MULTIVIEW
139
+    CHECK((param->numViews > 2), "Multi-View Encoding currently support only 2 views");
140
+    CHECK((param->numViews > 1) && (param->internalBitDepth != 8), "BitDepthConstraint must be 8 for Multiview main profile");
141
+    CHECK((param->numViews > 1) && (param->analysisMultiPassDistortion || param->analysisMultiPassRefine), "Multiview encode doesnot support multipass feature");
142
+#endif
143
+#if ENABLE_SCC_EXT
144
+    CHECK(!!param->bEnableSCC&& param->rdLevel != 6, "Enabling scc extension in x265 requires rdlevel of 6 ");
145
+#endif
146
     return check_failed;
147
 }
148
 
149
@@ -2072,6 +2134,12 @@
150
     TOOLOPT(param->rc.bStatWrite, "stats-write");
151
     TOOLOPT(param->rc.bStatRead,  "stats-read");
152
     TOOLOPT(param->bSingleSeiNal, "single-sei");
153
+#if ENABLE_ALPHA
154
+    TOOLOPT(param->numScalableLayers > 1, "alpha");
155
+#endif
156
+#if ENABLE_MULTIVIEW
157
+    TOOLOPT(param->numViews > 1, "multi-view");
158
+#endif
159
 #if ENABLE_HDR10_PLUS
160
     TOOLOPT(param->toneMapFile != NULL, "dhdr10-info");
161
 #endif
162
@@ -2336,6 +2404,16 @@
163
     if (p->filmGrain)
164
         s += sprintf(s, " film-grain=%s", p->filmGrain); // Film grain characteristics model filename
165
     BOOL(p->bEnableTemporalFilter, "mcstf");
166
+#if ENABLE_ALPHA
167
+    BOOL(p->bEnableAlpha, "alpha");
168
+#endif
169
+#if ENABLE_MULTIVIEW
170
+    s += sprintf(s, " num-views=%d", p->numViews);
171
+    s += sprintf(s, " format=%d", p->format);
172
+#endif
173
+#if ENABLE_SCC_EXT
174
+    s += sprintf(s, "scc=%d", p->bEnableSCC);
175
+#endif
176
     BOOL(p->bEnableSBRC, "sbrc");
177
 #undef BOOL
178
     return buf;
179
@@ -2558,6 +2636,7 @@
180
 
181
 void x265_copy_params(x265_param* dst, x265_param* src)
182
 {
183
+    dst->mcstfFrameRange = src->mcstfFrameRange;
184
     dst->cpuid = src->cpuid;
185
     dst->frameNumThreads = src->frameNumThreads;
186
     if (src->numaPools) dst->numaPools = strdup(src->numaPools);
187
@@ -2856,6 +2935,18 @@
188
     dst->confWinRightOffset = src->confWinRightOffset;
189
     dst->confWinBottomOffset = src->confWinBottomOffset;
190
     dst->bliveVBV2pass = src->bliveVBV2pass;
191
+#if ENABLE_ALPHA
192
+    dst->bEnableAlpha = src->bEnableAlpha;
193
+    dst->numScalableLayers = src->numScalableLayers;
194
+#endif
195
+#if ENABLE_MULTIVIEW
196
+    dst->numViews = src->numViews;
197
+    dst->format = src->format;
198
+#endif
199
+    dst->numLayers = src->numLayers;
200
+#if ENABLE_SCC_EXT
201
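Editor's note: the new alpha/multiview/SCC switches are plain key-value options, so API users can reach them through the public x265_param_parse() entry point as well as the CLI. A hedged usage sketch (the keys are only recognized in builds configured with ENABLE_ALPHA, ENABLE_MULTIVIEW and ENABLE_SCC_EXT respectively):

    #include <x265.h>

    void enableLayeredFeatures(x265_param* p)
    {
        x265_param_parse(p, "alpha", "1");      // needs ENABLE_ALPHA; forces two scalable layers
        x265_param_parse(p, "num-views", "2");  // needs ENABLE_MULTIVIEW
        x265_param_parse(p, "scc", "1");        // needs ENABLE_SCC_EXT; validated against rd level 6
    }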
x265_3.6.tar.gz/source/common/piclist.cpp -> x265_4.0.tar.gz/source/common/piclist.cpp Changed
160
 
1
@@ -82,6 +82,82 @@
2
     m_count++;
3
 }
4
 
5
+#if ENABLE_MULTIVIEW
6
+Frame* PicList::popFrontSubDPB()
7
+{
8
+    if (m_start)
9
+    {
10
+        Frame* temp = m_start;
11
+        m_count--;
12
+
13
+        if (m_count)
14
+        {
15
+            m_start = m_start->m_nextSubDPB;
16
+            m_start->m_prevSubDPB = NULL;
17
+        }
18
+        else
19
+        {
20
+            m_start = m_end = NULL;
21
+        }
22
+        temp->m_next = temp->m_prev = NULL;
23
+        return temp;
24
+    }
25
+    else
26
+        return NULL;
27
+}
28
+
29
+void PicList::pushBackSubDPB(Frame& curFrame)
30
+{
31
+    X265_CHECK(!curFrame.m_nextSubDPB && !curFrame.m_prevSubDPB, "piclist: picture already in Sub DPB list\n"); // ensure frame is not in a list
32
+    curFrame.m_nextSubDPB = NULL;
33
+    curFrame.m_prevSubDPB = m_end;
34
+
35
+    if (m_count)
36
+    {
37
+        m_end->m_nextSubDPB = &curFrame;
38
+        m_end = &curFrame;
39
+    }
40
+    else
41
+    {
42
+        m_start = m_end = &curFrame;
43
+    }
44
+    m_count++;
45
+}
46
+
47
+void PicList::removeSubDPB(Frame& curFrame)
48
+{
49
+#if _DEBUG
50
+    Frame* tmp = m_start;
51
+    while (tmp && tmp != &curFrame)
52
+    {
53
+        tmp = tmp->m_nextSubDPB;
54
+    }
55
+
56
+    X265_CHECK(tmp == &curFrame, "piclist: pic being removed was not in list\n"); // verify pic is in this list
57
+#endif
58
+
59
+    m_count--;
60
+    if (m_count)
61
+    {
62
+        if (m_start == &curFrame)
63
+            m_start = curFrame.m_nextSubDPB;
64
+        if (m_end == &curFrame)
65
+            m_end = curFrame.m_prevSubDPB;
66
+
67
+        if (curFrame.m_nextSubDPB)
68
+            curFrame.m_nextSubDPB->m_prevSubDPB = curFrame.m_prevSubDPB;
69
+        if (curFrame.m_prevSubDPB)
70
+            curFrame.m_prevSubDPB->m_nextSubDPB = curFrame.m_nextSubDPB;
71
+    }
72
+    else
73
+    {
74
+        m_start = m_end = NULL;
75
+    }
76
+
77
+    curFrame.m_nextSubDPB = curFrame.m_prevSubDPB = NULL;
78
+}
79
+#endif
80
+
81
 void PicList::pushBackMCSTF(Frame& curFrame)
82
 {
83
     X265_CHECK(!curFrame.m_nextMCSTF && !curFrame.m_prevMCSTF, "piclist: picture already in OPB list\n"); // ensure frame is not in a list
84
@@ -123,11 +199,16 @@
85
         return NULL;
86
 }
87
 
88
-Frame* PicList::getPOC(int poc)
89
+Frame* PicList::getPOC(int poc, int sLayerId)
90
 {
91
     Frame *curFrame = m_start;
92
-    while (curFrame && curFrame->m_poc != poc)
93
+    int layer = curFrame->m_param->numViews > 1 ? curFrame->m_viewId : (curFrame->m_param->numScalableLayers > 1) ? curFrame->m_sLayerId : 0;
94
+    while (curFrame && (curFrame->m_poc != poc || layer != sLayerId))
95
+    {
96
         curFrame = curFrame->m_next;
97
+        if(curFrame)
98
+            layer = curFrame->m_param->numViews > 1 ? curFrame->m_viewId : (curFrame->m_param->numScalableLayers > 1) ? curFrame->m_sLayerId : 0;
99
+    }
100
     return curFrame;
101
 }
102
 
103
@@ -185,10 +266,11 @@
104
         return NULL;
105
 }
106
 
107
-Frame* PicList::getCurFrame(void)
108
+Frame* PicList::getCurFrame(int sLayer)
109
 {
110
     Frame *curFrame = m_start;
111
-    if (curFrame != NULL)
112
+    int layer = curFrame->m_param->numViews > 1 ? curFrame->m_viewId : (curFrame->m_param->numScalableLayers > 1) ? curFrame->m_sLayerId : 0;
113
+    if (layer == sLayer && curFrame != NULL)
114
         return curFrame;
115
     else
116
         return NULL;
117
@@ -227,6 +309,42 @@
118
     curFrame.m_next = curFrame.m_prev = NULL;
119
 }
120
 
121
+
122
+Frame* PicList::removeFrame(Frame& curFrame)
123
+{
124
+    Frame* tmp = &curFrame;
125
+#if _DEBUG
126
+    tmp = m_start;
127
+    while (tmp && tmp != &curFrame)
128
+    {
129
+        tmp = tmp->m_next;
130
+    }
131
+
132
+    X265_CHECK(tmp == &curFrame, "piclist: pic being removed was not in list\n"); // verify pic is in this list
133
+#endif
134
+
135
+    m_count--;
136
+    if (m_count)
137
+    {
138
+        if (m_start == &curFrame)
139
+            m_start = curFrame.m_next;
140
+        if (m_end == &curFrame)
141
+            m_end = curFrame.m_prev;
142
+
143
+        if (curFrame.m_next)
144
+            curFrame.m_next->m_prev = curFrame.m_prev;
145
+        if (curFrame.m_prev)
146
+            curFrame.m_prev->m_next = curFrame.m_next;
147
+    }
148
+    else
149
+    {
150
+        m_start = m_end = NULL;
151
+    }
152
+
153
+    curFrame.m_next = curFrame.m_prev = NULL;
154
+    return tmp;
155
+}
156
+
157
 void PicList::removeMCSTF(Frame& curFrame)
158
 {
159
 #if _DEBUG
160
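Editor's note: the new sub-DPB helpers maintain a second doubly linked list through m_nextSubDPB/m_prevSubDPB alongside the existing m_next/m_prev chain. A standalone sketch of the unlink invariant they implement (illustrative, not the actual PicList code):

    struct Node { Node* prev; Node* next; };

    // Remove n from the list while keeping head/tail consistent, then clear its links.
    void unlink(Node*& head, Node*& tail, Node* n)
    {
        if (n->prev) n->prev->next = n->next; else head = n->next;
        if (n->next) n->next->prev = n->prev; else tail = n->prev;
        n->prev = n->next = nullptr;
    }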
x265_3.6.tar.gz/source/common/piclist.h -> x265_4.0.tar.gz/source/common/piclist.h Changed
45
 
1
@@ -50,10 +50,16 @@
2
     /** Push picture to end of the list */
3
     void pushBack(Frame& pic);
4
     void pushBackMCSTF(Frame& pic);
5
+#if ENABLE_MULTIVIEW
6
+    void pushBackSubDPB(Frame& pic);
7
+#endif
8
 
9
     /** Push picture to beginning of the list */
10
     void pushFront(Frame& pic);
11
     void pushFrontMCSTF(Frame& pic);
12
+#if ENABLE_MULTIVIEW
13
+    Frame* popFrontSubDPB();
14
+#endif
15
 
16
     /** Pop picture from end of the list */
17
     Frame* popBack();
18
@@ -63,17 +69,24 @@
19
     Frame* popFront();
20
 
21
     /** Find frame with specified POC */
22
-    Frame* getPOC(int poc);
23
+    Frame* getPOC(int poc, int sLayerId = 0);
24
     /* Find next MCSTF frame with specified POC */
25
     Frame* getPOCMCSTF(int poc);
26
 
27
     /** Get the current Frame from the list **/
28
-    Frame* getCurFrame(void);
29
+    Frame* getCurFrame(int sLayer);
30
 
31
     /** Remove picture from list */
32
     void remove(Frame& pic);
33
+
34
+    /** Remove picture from list */
35
+    Frame* removeFrame(Frame& pic);
36
     /* Remove MCSTF picture from list */
37
     void removeMCSTF(Frame& pic);
38
+#if ENABLE_MULTIVIEW
39
+    /** Remove picture from Sub list */
40
+    void removeSubDPB(Frame& pic);
41
+#endif
42
 
43
     Frame* first()        { return m_start;   }
44
 
45
x265_3.6.tar.gz/source/common/picyuv.cpp -> x265_4.0.tar.gz/source/common/picyuv.cpp Changed
201
 
1
@@ -258,7 +258,7 @@
2
 
3
 /* Copy pixels from an x265_picture into internal PicYuv instance.
4
  * Shift pixels as necessary, mask off bits above X265_DEPTH for safety. */
5
-void PicYuv::copyFromPicture(const x265_picture& pic, const x265_param& param, int padx, int pady)
6
+void PicYuv::copyFromPicture(const x265_picture& pic, const x265_param& param, int padx, int pady, bool isBase)
7
 {
8
     /* m_picWidth is the width that is being encoded, padx indicates how many
9
      * of those pixels are padding to reach multiple of MinCU(4) size.
10
@@ -321,78 +321,157 @@
11
 #else /* Case for (X265_DEPTH == 8) */
12
             // TODO: Does we need this path? may merge into above in future
13
         {
14
-            pixel *yPixel = m_picOrg[0];
15
-            uint8_t *yChar = (uint8_t*)pic.planes[0];
16
-
17
-            for (int r = 0; r < height; r++)
18
+            if (isBase || param.numViews > 1)
19
             {
20
-                memcpy(yPixel, yChar, width * sizeof(pixel));
21
+                int offsetX, offsetY;
22
+                offsetX = (!isBase && pic.format == 1 ? width : 0);
23
+                offsetY = (!isBase && pic.format == 2 ? pic.stride[0] * height : 0);
24
+                pixel *yPixel = m_picOrg[0];
25
+                uint8_t* yChar = (uint8_t*)pic.planes[0] + offsetX + offsetY;
26
 
27
-                yPixel += m_stride;
28
-                yChar += pic.stride[0] / sizeof(*yChar);
29
-            }
30
+                for (int r = 0; r < height; r++)
31
+                {
32
+                    memcpy(yPixel, yChar, width * sizeof(pixel));
33
 
34
-            if (param.internalCsp != X265_CSP_I400)
35
+                    yPixel += m_stride;
36
+                    yChar += pic.stride[0] / sizeof(*yChar);
37
+                }
38
+
39
+                if (param.internalCsp != X265_CSP_I400)
40
+                {
41
+                    offsetX = offsetX >> m_hChromaShift;
42
+                    int offsetYU = (!isBase && pic.format == 2 ? pic.stride[1] * (height >> m_vChromaShift) : 0);
43
+                    int offsetYV = (!isBase && pic.format == 2 ? pic.stride[2] * (height >> m_vChromaShift) : 0);
44
+
45
+                    pixel *uPixel = m_picOrg[1];
46
+                    pixel *vPixel = m_picOrg[2];
47
+
48
+                    uint8_t* uChar = (uint8_t*)pic.planes[1] + offsetX + offsetYU;
49
+                    uint8_t* vChar = (uint8_t*)pic.planes[2] + offsetX + offsetYV;
50
+
51
+                    for (int r = 0; r < height >> m_vChromaShift; r++)
52
+                    {
53
+                        memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
54
+                        memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
55
+
56
+                        uPixel += m_strideC;
57
+                        vPixel += m_strideC;
58
+                        uChar += pic.stride[1] / sizeof(*uChar);
59
+                        vChar += pic.stride[2] / sizeof(*vChar);
60
+                    }
61
+                }
62
+            }
63
+#if ENABLE_ALPHA
64
+            if (!isBase && param.bEnableAlpha)
65
             {
66
-                pixel *uPixel = m_picOrg[1];
67
-                pixel *vPixel = m_picOrg[2];
68
+                pixel* aPixel = m_picOrg[0];
69
+                uint8_t* aChar = (uint8_t*)pic.planes[3];
70
 
71
-                uint8_t *uChar = (uint8_t*)pic.planes[1];
72
-                uint8_t *vChar = (uint8_t*)pic.planes[2];
73
+                for (int r = 0; r < height; r++)
74
+                {
75
+                    memcpy(aPixel, aChar, width * sizeof(pixel));
76
+
77
+                    aPixel += m_stride;
78
+                    aChar += pic.stride[0] / sizeof(*aChar);
79
+                }
80
+
81
+                pixel* uPixel = m_picOrg[1];
82
+                pixel* vPixel = m_picOrg[2];
83
 
84
                 for (int r = 0; r < height >> m_vChromaShift; r++)
85
                 {
86
-                    memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
87
-                    memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
88
+                    memset(uPixel, 128, (width >> m_hChromaShift) * sizeof(pixel));
89
+                    memset(vPixel, 128, (width >> m_hChromaShift) * sizeof(pixel));
90
 
91
                     uPixel += m_strideC;
92
                     vPixel += m_strideC;
93
-                    uChar += pic.stride[1] / sizeof(*uChar);
94
-                    vChar += pic.stride[2] / sizeof(*vChar);
95
                 }
96
             }
97
+#endif
98
         }
99
 #endif /* (X265_DEPTH > 8) */
100
         }
101
         else /* pic.bitDepth > 8 */
102
         {
103
             /* defensive programming, mask off bits that are supposed to be zero */
104
-            uint16_t mask = (1 << X265_DEPTH) - 1;
105
-            int shift = abs(pic.bitDepth - X265_DEPTH);
106
-            pixel *yPixel = m_picOrg[0];
107
+            if (isBase)
108
+            {
109
+                uint16_t mask = (1 << X265_DEPTH) - 1;
110
+                int shift = abs(pic.bitDepth - X265_DEPTH);
111
+                pixel* yPixel = m_picOrg[0];
112
 
113
-            uint16_t *yShort = (uint16_t*)pic.planes[0];
114
+                uint16_t* yShort = (uint16_t*)pic.planes[0];
115
 
116
-            if (pic.bitDepth > X265_DEPTH)
117
-            {
118
-                /* shift right and mask pixels to final size */
119
-                primitives.planecopy_sp(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
120
-            }
121
-            else /* Case for (pic.bitDepth <= X265_DEPTH) */
122
-            {
123
-                /* shift left and mask pixels to final size */
124
-                primitives.planecopy_sp_shl(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
125
-            }
126
+                if (pic.bitDepth > X265_DEPTH)
127
+                {
128
+                    /* shift right and mask pixels to final size */
129
+                    primitives.planecopy_sp(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
130
+                }
131
+                else /* Case for (pic.bitDepth <= X265_DEPTH) */
132
+                {
133
+                    /* shift left and mask pixels to final size */
134
+                    primitives.planecopy_sp_shl(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
135
+                }
136
 
137
-            if (param.internalCsp != X265_CSP_I400)
138
+                if (param.internalCsp != X265_CSP_I400)
139
+                {
140
+                    pixel* uPixel = m_picOrg[1];
141
+                    pixel* vPixel = m_picOrg[2];
142
+
143
+                    uint16_t* uShort = (uint16_t*)pic.planes[1];
144
+                    uint16_t* vShort = (uint16_t*)pic.planes[2];
145
+
146
+                    if (pic.bitDepth > X265_DEPTH)
147
+                    {
148
+                        primitives.planecopy_sp(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
149
+                        primitives.planecopy_sp(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
150
+                    }
151
+                    else /* Case for (pic.bitDepth <= X265_DEPTH) */
152
+                    {
153
+                        primitives.planecopy_sp_shl(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
154
+                        primitives.planecopy_sp_shl(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
155
+                    }
156
+                }
157
+            }
158
+#if ENABLE_ALPHA
159
+            if (!isBase && param.bEnableAlpha)
160
             {
161
-                pixel *uPixel = m_picOrg[1];
162
-                pixel *vPixel = m_picOrg[2];
163
+                /* defensive programming, mask off bits that are supposed to be zero */
164
+                uint16_t mask = (1 << X265_DEPTH) - 1;
165
+                int shift = abs(pic.bitDepth - X265_DEPTH);
166
+                pixel* yPixel = m_picOrg[0];
167
 
168
-                uint16_t *uShort = (uint16_t*)pic.planes[1];
169
-                uint16_t *vShort = (uint16_t*)pic.planes[2];
170
+                uint16_t* yShort = (uint16_t*)pic.planes[3];
171
 
172
                 if (pic.bitDepth > X265_DEPTH)
173
                 {
174
-                    primitives.planecopy_sp(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
175
-                    primitives.planecopy_sp(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
176
+                    /* shift right and mask pixels to final size */
177
+                    primitives.planecopy_sp(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
178
                 }
179
                 else /* Case for (pic.bitDepth <= X265_DEPTH) */
180
                 {
181
-                    primitives.planecopy_sp_shl(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
182
-                    primitives.planecopy_sp_shl(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
183
+                    /* shift left and mask pixels to final size */
184
+                    primitives.planecopy_sp_shl(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
185
+                }
186
+
187
+                if (param.internalCsp != X265_CSP_I400)
188
+                {
189
+                    pixel* uPixel = m_picOrg[1];
190
+                    pixel* vPixel = m_picOrg[2];
191
+
192
+                    for (int r = 0; r < height >> m_vChromaShift; r++)
193
+                    {
194
+                        for (int c = 0; c < (width >> m_hChromaShift); c++)
195
+                        {
196
+                            uPixel[c] = ((1 << X265_DEPTH) >> 1);
197
+                            vPixel[c] = ((1 << X265_DEPTH) >> 1);
198
+                        }
199
+                        uPixel += m_strideC;
200
+                        vPixel += m_strideC;
201
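Editor's note: when the auxiliary alpha layer is copied in, its luma comes from pic.planes[3] and the chroma planes are filled with mid-grey. A quick check of that neutral value (X265_DEPTH is the build-time bit depth):

    constexpr int neutralChroma(int bitDepth) { return (1 << bitDepth) >> 1; }
    static_assert(neutralChroma(8) == 128 && neutralChroma(10) == 512, "mid-grey per bit depth");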
x265_3.6.tar.gz/source/common/picyuv.h -> x265_4.0.tar.gz/source/common/picyuv.h Changed
10
 
1
@@ -83,7 +83,7 @@
2
     void  destroy();
3
     int   getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp);
4
 
5
-    void  copyFromPicture(const x265_picture&, const x265_param& param, int padx, int pady);
6
+    void  copyFromPicture(const x265_picture&, const x265_param& param, int padx, int pady, bool isBase = true);
7
     void  copyFromFrame(PicYuv* source);
8
 
9
     intptr_t getChromaAddrOffset(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
10
x265_3.6.tar.gz/source/common/pixel.cpp -> x265_4.0.tar.gz/source/common/pixel.cpp Changed
23
 
1
@@ -266,10 +266,6 @@
2
 {
3
     int satd = 0;
4
 
5
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
6
-    pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
7
-#endif
8
-
9
     for (int row = 0; row < h; row += 4)
10
         for (int col = 0; col < w; col += 4)
11
             satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
12
@@ -284,10 +280,6 @@
13
 {
14
     int satd = 0;
15
 
16
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
17
-    pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
18
-#endif
19
-
20
     for (int row = 0; row < h; row += 4)
21
         for (int col = 0; col < w; col += 8)
22
             satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
23
x265_3.6.tar.gz/source/common/predict.cpp -> x265_4.0.tar.gz/source/common/predict.cpp Changed
98
 
1
@@ -112,10 +112,22 @@
2
         }
3
         else
4
         {
5
-            if (bLuma)
6
-                predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
7
-            if (bChroma)
8
-                predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
9
+#if ENABLE_SCC_EXT
10
+            if (cu.m_slice->m_param->bEnableSCC && refIdx0 == (cu.m_slice->m_numRefIdx[0] - 1))
11
+            {
12
+                if (bLuma)
13
+                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refFrameList[0][refIdx0]->m_reconPic[1], mv0);
14
+                if (bChroma)
15
+                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refFrameList[0][refIdx0]->m_reconPic[1], mv0);
16
+            }
17
+            else
18
+#endif
19
+            {
20
+                if (bLuma)
21
+                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
22
+                if (bChroma)
23
+                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
24
+            }
25
         }
26
     }
27
     else
28
@@ -174,12 +186,22 @@
29
 
30
             if (bLuma)
31
             {
32
-                predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
33
+#if ENABLE_SCC_EXT
34
+                if (cu.m_slice->m_param->bEnableSCC && refIdx0 == (cu.m_slice->m_numRefIdx[0] - 1))
35
+                    predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refFrameList[0][refIdx0]->m_reconPic[1], mv0);
36
+                else
37
+#endif
38
+                    predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
39
                 predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
40
             }
41
             if (bChroma)
42
             {
43
-                predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
44
+#if ENABLE_SCC_EXT
45
+                if (cu.m_slice->m_param->bEnableSCC && refIdx0 == (cu.m_slice->m_numRefIdx[0] - 1))
46
+                    predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refFrameList[0][refIdx0]->m_reconPic[1], mv0);
47
+                else
48
+#endif
49
+                    predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
50
                 predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
51
             }
52
 
53
@@ -206,10 +228,22 @@
54
             }
55
             else
56
             {
57
-                if (bLuma)
58
-                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
59
-                if (bChroma)
60
-                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
61
+#if ENABLE_SCC_EXT
62
+                if (cu.m_slice->m_param->bEnableSCC && refIdx0 == (cu.m_slice->m_numRefIdx[0] - 1))
63
+                {
64
+                    if (bLuma)
65
+                        predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refFrameList[0][refIdx0]->m_reconPic[1], mv0);
66
+                    if (bChroma)
67
+                        predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refFrameList[0][refIdx0]->m_reconPic[1], mv0);
68
+                }
69
+                else
70
+#endif
71
+                {
72
+                    if (bLuma)
73
+                        predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
74
+                    if (bChroma)
75
+                        predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
76
+                }
77
             }
78
         }
79
         else
80
@@ -602,7 +636,7 @@
81
     int tuSize = 1 << intraNeighbors.log2TrSize;
82
     int tuSize2 = tuSize << 1;
83
 
84
-    PicYuv* reconPic = cu.m_encData->m_reconPic;
85
+    PicYuv* reconPic = cu.m_encData->m_reconPic[0];
86
     pixel* adiOrigin = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
87
     intptr_t picStride = reconPic->m_stride;
88
 
89
@@ -651,7 +685,7 @@
90
 
91
 void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId)
92
 {
93
-    PicYuv* reconPic = cu.m_encData->m_reconPic;
94
+    PicYuv* reconPic = cu.m_encData->m_reconPic[0];
95
     const pixel* adiOrigin = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
96
     intptr_t picStride = reconPic->m_strideC;
97
 
98
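Editor's note: with SCC enabled, the last index of reference list 0 denotes the current picture itself, so prediction reads the unfiltered reconstruction (m_reconPic[1]) rather than a loop-filtered reference. A sketch of the guard the hunks above repeat (names mirror the diff; illustrative only):

    inline bool usesCurrentPictureRef(bool bEnableSCC, int refIdx0, int numRefIdxL0)
    {
        // true only for the synthetic "current picture" entry appended for intra block copy
        return bEnableSCC && refIdx0 == numRefIdxL0 - 1;
    }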
x265_3.6.tar.gz/source/common/primitives.cpp -> x265_4.0.tar.gz/source/common/primitives.cpp Changed
12
 
1
@@ -258,8 +258,8 @@
2
             primitives.cu[i].intra_pred_allangs = NULL;
3
 
4
 #if ENABLE_ASSEMBLY
5
-#if X265_ARCH_X86
6
-        setupInstrinsicPrimitives(primitives, param->cpuid);
7
+#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64)
8
+        setupIntrinsicPrimitives(primitives, param->cpuid);
9
 #endif
10
         setupAssemblyPrimitives(primitives, param->cpuid);
11
 #endif
12
x265_3.6.tar.gz/source/common/primitives.h -> x265_4.0.tar.gz/source/common/primitives.h Changed
15
 
1
@@ -470,12 +470,9 @@
2
 }
3
 
4
 void setupCPrimitives(EncoderPrimitives &p);
5
-void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask);
6
+void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask);
7
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask);
8
 void setupAliasPrimitives(EncoderPrimitives &p);
9
-#if X265_ARCH_ARM64
10
-void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask);
11
-#endif
12
 #if HAVE_ALTIVEC
13
 void setupPixelPrimitives_altivec(EncoderPrimitives &p);
14
 void setupDCTPrimitives_altivec(EncoderPrimitives &p);
15
x265_3.6.tar.gz/source/common/slice.cpp -> x265_4.0.tar.gz/source/common/slice.cpp Changed
201
 
1
@@ -29,17 +29,83 @@
2
 
3
 using namespace X265_NS;
4
 
5
-void Slice::setRefPicList(PicList& picList)
6
+#if ENABLE_MULTIVIEW
7
+void Slice::createInterLayerReferencePictureSet(PicList& picList, PicList& refPicSetInterLayer0, PicList& refPicSetInterLayer1)
8
 {
9
+
10
+    for (int i = 0; i < 1; i++)
11
+    {
12
+        int layerIdRef = 0;// getRefPicLayerId(i);
13
+        Frame* refPic = picList.getPOC(m_poc, 0);
14
+        int viewIdCur = 0;
15
+        int viewIdZero = 1;
16
+        int viewIdRef = 1;
17
+
18
+        if ((viewIdCur <= viewIdZero && viewIdCur <= viewIdRef) || (viewIdCur >= viewIdZero && viewIdCur >= viewIdRef))
19
+        {
20
+            refPicSetInterLayer0.pushBackSubDPB(*refPic);
21
+        }
22
+        else
23
+        {
24
+            refPicSetInterLayer1.pushBackSubDPB(*refPic);
25
+        }
26
+    }
27
+}
28
+#endif
29
+
30
+void Slice::setRefPicList(PicList& picList, PicList& refPicSetInterLayer0, PicList& refPicSetInterLayer1, int sLayerId)
31
+{
32
+    bool checkNumPocTotalCurr = m_param->bEnableSCC ? false : true;
33
     if (m_sliceType == I_SLICE)
34
     {
35
         memset(m_refFrameList, 0, sizeof(m_refFrameList));
36
         memset(m_refReconPicList, 0, sizeof(m_refReconPicList));
37
         memset(m_refPOCList, 0, sizeof(m_refPOCList));
38
         m_numRefIdx[1] = m_numRefIdx[0] = 0;
39
+
40
+#if ENABLE_SCC_EXT
41
+        if (!checkNumPocTotalCurr)
42
+        {
43
+            if (m_rps.numberOfPictures == 0)
44
+            {
45
+                Frame* prevPic = picList.getPOC(X265_MAX(0, m_poc - 1));
46
+                if (prevPic->m_poc != X265_MAX(0, m_poc - 1))
47
+                {
48
+                    prevPic = picList.getPOC(m_poc);
49
+                }
50
+                m_lastEncPic = prevPic;
51
+            }
52
+            return;
53
+        }
54
+#endif
55
+
56
         return;
57
     }
58
 
59
+#if ENABLE_SCC_EXT || ENABLE_MULTIVIEW || ENABLE_ALPHA
60
+    /*Reset the number of references for I-slice marked as P-slice*/
61
+    if ((m_param->bEnableSCC || sLayerId) && m_sliceType != m_origSliceType)
62
+    {
63
+        memset(m_refFrameList, 0, sizeof(m_refFrameList));
64
+        memset(m_refReconPicList, 0, sizeof(m_refReconPicList));
65
+        memset(m_refPOCList, 0, sizeof(m_refPOCList));
66
+        m_numRefIdx[0] = 1;
67
+    }
68
+#endif
69
+
70
+#if ENABLE_SCC_EXT
71
+    if (!checkNumPocTotalCurr && m_rps.numberOfPictures == 0)
72
+    {
73
+        Frame* prevPic = picList.getPOC(X265_MAX(0, m_poc - 1));
74
+        if (prevPic->m_poc != X265_MAX(0, m_poc - 1))
75
+        {
76
+            prevPic = picList.getPOC(m_poc);
77
+
78
+        }
79
+        m_lastEncPic = prevPic;
80
+    }
81
+#endif
82
+
83
     Frame* refPic = NULL;
84
     Frame* refPicSetStCurr0[MAX_NUM_REF];
85
     Frame* refPicSetStCurr1[MAX_NUM_REF];
86
@@ -51,9 +117,9 @@
87
 
88
     for (i = 0; i < m_rps.numberOfNegativePictures; i++)
89
     {
90
-        if (m_rps.bUsed[i])
91
+        if (m_rps.bUsed[i] && m_origSliceType != I_SLICE)
92
         {
93
-            refPic = picList.getPOC(m_poc + m_rps.deltaPOC[i]);
94
+            refPic = picList.getPOC(m_poc + m_rps.deltaPOC[i], m_rps.deltaPOC[i] ? sLayerId : 0);
95
             refPicSetStCurr0[numPocStCurr0] = refPic;
96
             numPocStCurr0++;
97
         }
98
@@ -61,9 +127,9 @@
99
 
100
     for (; i < m_rps.numberOfNegativePictures + m_rps.numberOfPositivePictures; i++)
101
     {
102
-        if (m_rps.bUsedi)
103
+        if (m_rps.bUsedi && m_origSliceType != I_SLICE)
104
         {
105
-            refPic = picList.getPOC(m_poc + m_rps.deltaPOCi);
106
+            refPic = picList.getPOC(m_poc + m_rps.deltaPOCi, m_rps.deltaPOCi ? sLayerId : 0);
107
             refPicSetStCurr1numPocStCurr1 = refPic;
108
             numPocStCurr1++;
109
         }
110
@@ -75,18 +141,44 @@
111
     // ref_pic_list_init
112
     Frame* rpsCurrList0MAX_NUM_REF + 1;
113
     Frame* rpsCurrList1MAX_NUM_REF + 1;
114
+#if ENABLE_MULTIVIEW
115
+    int numPocTotalCurr = numPocStCurr0 + numPocStCurr1 + numPocLtCurr + refPicSetInterLayer0.size() + refPicSetInterLayer1.size();
116
+#else
117
     int numPocTotalCurr = numPocStCurr0 + numPocStCurr1 + numPocLtCurr;
118
+#endif
119
+
120
+#if ENABLE_SCC_EXT
121
+    if (m_param->bEnableSCC)
122
+        numPocTotalCurr++;
123
+#endif
124
 
125
     int cIdx = 0;
126
     for (i = 0; i < numPocStCurr0; i++, cIdx++)
127
         rpsCurrList0cIdx = refPicSetStCurr0i;
128
 
129
+#if ENABLE_MULTIVIEW
130
+    if (m_param->numViews > 1)
131
+        for (i = 0; i < refPicSetInterLayer0.size(); i++, cIdx++)
132
+            rpsCurrList0cIdx = refPicSetInterLayer0.getPOC(m_poc, 0);
133
+#endif
134
+
135
     for (i = 0; i < numPocStCurr1; i++, cIdx++)
136
         rpsCurrList0cIdx = refPicSetStCurr1i;
137
 
138
     for (i = 0; i < numPocLtCurr; i++, cIdx++)
139
         rpsCurrList0cIdx = refPicSetLtCurri;
140
 
141
+#if ENABLE_MULTIVIEW
142
+    if (m_param->numViews > 1)
143
+        for (i = 0; i < refPicSetInterLayer1.size(); i++, cIdx++)
144
+            rpsCurrList0cIdx = refPicSetInterLayer1.getPOC(m_poc, 0);
145
+#endif
146
+
147
+#if ENABLE_SCC_EXT
148
+    if (m_param->bEnableSCC)
149
+        rpsCurrList0cIdx++ = picList.getPOC(m_poc);
150
+#endif
151
+
152
     X265_CHECK(cIdx == numPocTotalCurr, "RPS index check fail\n");
153
 
154
     if (m_sliceType == B_SLICE)
155
@@ -95,12 +187,29 @@
156
         for (i = 0; i < numPocStCurr1; i++, cIdx++)
157
             rpsCurrList1cIdx = refPicSetStCurr1i;
158
 
159
+#if ENABLE_MULTIVIEW
160
+        if (m_param->numViews > 1)
161
+            for (i = 0; i < refPicSetInterLayer1.size(); i++, cIdx++)
162
+                rpsCurrList1cIdx = refPicSetInterLayer1.getPOC(m_poc, 0);
163
+#endif
164
+
165
         for (i = 0; i < numPocStCurr0; i++, cIdx++)
166
             rpsCurrList1cIdx = refPicSetStCurr0i;
167
 
168
         for (i = 0; i < numPocLtCurr; i++, cIdx++)
169
             rpsCurrList1cIdx = refPicSetLtCurri;
170
 
171
+#if ENABLE_MULTIVIEW
172
+        if (m_param->numViews > 1)
173
+            for (i = 0; i < refPicSetInterLayer0.size(); i++, cIdx++)
174
+                rpsCurrList1cIdx = refPicSetInterLayer0.getPOC(m_poc, 0);
175
+#endif
176
+
177
+#if  ENABLE_SCC_EXT
178
+        if (m_param->bEnableSCC)
179
+            rpsCurrList1cIdx++ = picList.getPOC(m_poc);
180
+#endif
181
+
182
         X265_CHECK(cIdx == numPocTotalCurr, "RPS index check fail\n");
183
     }
184
 
185
@@ -109,8 +218,18 @@
186
         cIdx = rIdx % numPocTotalCurr;
187
         X265_CHECK(cIdx >= 0 && cIdx < numPocTotalCurr, "RPS index check fail\n");
188
         m_refFrameList0rIdx = rpsCurrList0cIdx;
189
+#if ENABLE_MULTIVIEW
190
+        m_refFrameList0rIdx = rpsCurrList0cIdx;
191
+#endif
192
     }
193
 
194
+#if  ENABLE_SCC_EXT
195
+    if (m_param->bEnableSCC && numPocTotalCurr > m_numRefIdx0)
196
+    {
197
+        m_refFrameList0m_numRefIdx0 - 1 = picList.getPOC(m_poc);
198
+    }
199
+#endif
200
+
201
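Editor's note: list construction now interleaves the inter-layer reference sets with the short-term and long-term candidates, and with SCC appends the current picture; the final lists are then filled modulo the candidate count. A minimal sketch of that wrap-around fill (illustrative, assuming cand[] already holds the ordered candidates):

    template <typename Pic>
    void fillRefList(Pic* dst[], Pic* const cand[], int numRefIdx, int numPocTotalCurr)
    {
        for (int rIdx = 0; rIdx < numRefIdx; rIdx++)
            dst[rIdx] = cand[rIdx % numPocTotalCurr];   // wrap when more refs are requested than candidates
    }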
x265_3.6.tar.gz/source/common/slice.h -> x265_4.0.tar.gz/source/common/slice.h Changed
124
 
1
@@ -73,7 +73,11 @@
2
         MAIN10 = 2,
3
         MAINSTILLPICTURE = 3,
4
         MAINREXT = 4,
5
-        HIGHTHROUGHPUTREXT = 5
6
+        HIGHTHROUGHPUTREXT = 5,
7
+        MULTIVIEWMAIN = 6,
8
+        SCALABLEMAIN = 7,
9
+        SCALABLEMAIN10 = 8,
10
+        MAINSCC = 9
11
     };
12
 }
13
 
14
@@ -106,7 +110,7 @@
15
 
16
 struct ProfileTierLevel
17
 {
18
-    int      profileIdc;
19
+    int      profileIdc[MAX_LAYERS];
20
     int      levelIdc;
21
     uint32_t minCrForLevel;
22
     uint32_t maxLumaSrForLevel;
23
@@ -159,6 +163,27 @@
24
     uint32_t         numReorderPics[MAX_T_LAYERS];
25
     uint32_t         maxDecPicBuffering[MAX_T_LAYERS];
26
     uint32_t         maxLatencyIncrease[MAX_T_LAYERS];
27
+    int              m_numLayers;
28
+    int              m_numViews;
29
+    bool             vps_extension_flag;
30
+
31
+#if (ENABLE_ALPHA || ENABLE_MULTIVIEW)
32
+    bool             splitting_flag;
33
+    int              m_scalabilityMask[MAX_VPS_NUM_SCALABILITY_TYPES];
34
+    int              scalabilityTypes;
35
+    uint8_t          m_dimensionIdLen[MAX_VPS_NUM_SCALABILITY_TYPES];
36
+    uint8_t          m_dimensionId[MAX_VPS_LAYER_ID_PLUS1][MAX_VPS_NUM_SCALABILITY_TYPES];
37
+    bool             m_nuhLayerIdPresentFlag;
38
+    uint8_t          m_layerIdInNuh[MAX_VPS_LAYER_ID_PLUS1];
39
+    uint8_t          m_layerIdInVps[MAX_VPS_LAYER_ID_PLUS1];
40
+    int              m_viewIdLen;
41
+    int              m_vpsNumLayerSetsMinus1;
42
+    int              m_numLayersInIdList[1023];
43
+#endif
44
+
45
+#if ENABLE_MULTIVIEW
46
+    int              m_layerIdIncludedFlag;
47
+#endif
48
 };
49
 
50
 struct Window
51
@@ -252,6 +277,13 @@
52
 
53
     Window   conformanceWindow;
54
     VUI      vuiParameters;
55
+    bool     sps_extension_flag;
56
+
57
+#if ENABLE_MULTIVIEW
58
+    int      setSpsExtOrMaxSubLayersMinus1;
59
+    int      maxViews;
60
+    bool     vui_parameters_present_flag;
61
+#endif
62
 
63
     SPS()
64
     {
65
@@ -290,6 +322,11 @@
66
 
67
     int      numRefIdxDefault[2];
68
     bool     pps_slice_chroma_qp_offsets_present_flag;
69
+
70
+    bool     pps_extension_flag;
71
+    int      maxViews;
72
+
73
+    int      profileIdc;
74
 };
75
 
76
 struct WeightParam
77
@@ -339,6 +376,7 @@
78
 
79
     NalUnitType m_nalUnitType;
80
     SliceType   m_sliceType;
81
+    SliceType   m_origSliceType;
82
     int         m_sliceQp;
83
     int         m_chromaQpOffset[2];
84
     int         m_poc;
85
@@ -365,6 +403,13 @@
86
     int         m_fieldNum;
87
     Frame*      m_mcstfRefFrameList[2][MAX_MCSTF_TEMPORAL_WINDOW_LENGTH];
88
 
89
+#if  ENABLE_SCC_EXT
90
+    Frame*      m_lastEncPic;
91
+    bool        m_bLMvdL1Zero;
92
+    bool        m_useIntegerMv;
93
+#endif
94
+    bool        m_bTemporalMvp;
95
+
96
     Slice()
97
     {
98
         m_lastIDR = 0;
99
@@ -380,11 +425,23 @@
100
         m_rpsIdx = -1;
101
         m_chromaQpOffset[0] = m_chromaQpOffset[1] = 0;
102
         m_fieldNum = 0;
103
+#if  ENABLE_SCC_EXT
104
+        m_lastEncPic = NULL;
105
+        m_useIntegerMv = false;
106
+#endif
107
+        m_bTemporalMvp = false;
108
     }
109
 
110
     void disableWeights();
111
 
112
-    void setRefPicList(PicList& picList);
113
+    void setRefPicList(PicList& picList, PicList& refPicSetInterLayer0, PicList& refPicSetInterLayer1, int viewId);
114
+#if ENABLE_MULTIVIEW
115
+    void createInterLayerReferencePictureSet(PicList& picList, PicList& refPicSetInterLayer0, PicList& refPicSetInterLayer1);
116
+#endif
117
+
118
+#if  ENABLE_SCC_EXT
119
+    bool isOnlyCurrentPictureAsReference() const;
120
+#endif
121
 
122
     bool getRapPicFlag() const
123
     {
124
x265_3.6.tar.gz/source/common/threadpool.cpp -> x265_4.0.tar.gz/source/common/threadpool.cpp Changed
13
 
1
@@ -669,7 +669,11 @@
2
     else if (cpuCount >= 16)
3
         p->frameNumThreads = 4; 
4
     else if (cpuCount >= 8)
5
+#if _WIN32 && X265_ARCH_ARM64
6
+        p->frameNumThreads = cpuCount;
7
+#else
8
         p->frameNumThreads = 3;
9
+#endif
10
     else if (cpuCount >= 4)
11
         p->frameNumThreads = 2;
12
     else
13
x265_3.6.tar.gz/source/common/vec/vec-primitives.cpp -> x265_4.0.tar.gz/source/common/vec/vec-primitives.cpp Changed
10
 
1
@@ -59,7 +59,7 @@
2
 void setupIntrinsicDCT_sse41(EncoderPrimitives&);
3
 
4
 /* Use primitives for the best available vector architecture */
5
-void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
6
+void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
7
 {
8
 #ifdef HAVE_SSE3
9
     if (cpuMask & X265_CPU_SSE3)
10
x265_3.6.tar.gz/source/common/wavefront.cpp -> x265_4.0.tar.gz/source/common/wavefront.cpp Changed
22
 
1
@@ -58,6 +58,11 @@
2
     x265_free((void*)m_externalDependencyBitmap);
3
 }
4
 
5
+void WaveFront::setLayerId(int layer)
6
+{
7
+    m_sLayerId = layer;
8
+}
9
+
10
 void WaveFront::clearEnabledRowMask()
11
 {
12
     memset((void*)m_externalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords);
13
@@ -103,7 +108,7 @@
14
             if (ATOMIC_AND(&m_internalDependencyBitmapw, ~bit) & bit)
15
             {
16
                 /* we cleared the bit, we get to process the row */
17
-                processRow(w * 32 + id, threadId);
18
+                processRow(w * 32 + id, threadId, m_sLayerId);
19
                 m_helpWanted = true;
20
                 return; /* check for a higher priority task */
21
             }
22
x265_3.6.tar.gz/source/common/wavefront.h -> x265_4.0.tar.gz/source/common/wavefront.h Changed
21
 
1
@@ -52,6 +52,8 @@
2
 
3
     int m_numRows;
4
 
5
+    int m_sLayerId;
6
+
7
 protected:
8
     uint32_t *m_row_to_idx;
9
     uint32_t *m_idx_to_row;
10
@@ -95,7 +97,9 @@
11
 
12
     // Start or resume encode processing of this row, must be implemented by
13
     // derived classes.
14
-    virtual void processRow(int row, int threadId) = 0;
15
+    virtual void processRow(int row, int threadId, int layer) = 0;
16
+
17
+    void setLayerId(int layer);
18
 };
19
 } // end namespace X265_NS
20
 
21
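Editor's note: processRow() gains a layer argument so WPP rows can be dispatched per scalable or view layer, with setLayerId() recording which layer a frame encoder is working on. A standalone sketch of the widened hook (not the real WaveFront class, which also inherits the job-provider machinery):

    struct RowProcessor
    {
        virtual ~RowProcessor() = default;
        // row and threadId as before; layer selects the scalable/view layer being encoded
        virtual void processRow(int row, int threadId, int layer) = 0;
    };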
x265_3.6.tar.gz/source/encoder/analysis.cpp -> x265_4.0.tar.gz/source/encoder/analysis.cpp Changed
201
 
1
@@ -223,7 +223,12 @@
2
     }
3
     ProfileCUScope(ctu, totalCTUTime, totalCTUs);
4
 
5
-    if (m_slice->m_sliceType == I_SLICE)
6
+#if  ENABLE_SCC_EXT
7
+    memset(m_ibc.m_BVs, 0, sizeof(m_ibc.m_BVs));
8
+    memset(m_ibc.m_lastIntraBCMv, 0, sizeof(m_ibc.m_lastIntraBCMv));
9
+    m_ibc.m_numBV16s = 0; m_ibc.m_numBVs = 0;
10
+#endif
11
+    if (m_slice->m_sliceType == I_SLICE || (m_param->bEnableSCC && (m_slice->m_numRefIdx[0] == 1) && m_slice->m_refPOCList[0][0] == m_slice->m_poc))
12
     {
13
         x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
14
         if (m_param->analysisLoadReuseLevel > 1)
15
@@ -233,7 +238,11 @@
16
             memcpy(ctu.m_partSize, &intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], sizeof(char) * numPartition);
17
             memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
18
         }
19
+#if ENABLE_SCC_EXT
20
+        compressIntraCU(ctu, cuGeom, qp, &m_ibc);
21
+#else
22
         compressIntraCU(ctu, cuGeom, qp);
23
+#endif
24
     }
25
     else
26
     {
27
@@ -271,7 +280,7 @@
28
         {
29
             /* In RD Level 0/1, copy source pixels into the reconstructed block so
30
              * they are available for intra predictions */
31
-            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);
32
+            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic[0], ctu.m_cuAddr, 0);
33
 
34
             compressInterCU_rd0_4(ctu, cuGeom, qp);
35
 
36
@@ -304,7 +313,11 @@
37
         else if (m_param->rdLevel <= 4)
38
             compressInterCU_rd0_4(ctu, cuGeom, qp);
39
         else
40
+#if ENABLE_SCC_EXT
41
+            compressInterCU_rd5_6(ctu, cuGeom, qp, &m_ibc);
42
+#else
43
             compressInterCU_rd5_6(ctu, cuGeom, qp);
44
+#endif
45
     }
46
 
47
     if (m_param->bEnableRdRefine || m_param->bOptCUDeltaQP)
48
@@ -508,15 +521,22 @@
49
 
50
     /* Copy best data to encData CTU and recon */
51
     md.bestMode->cu.copyToPic(depth);
52
-    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
53
+    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic[0], parentCTU.m_cuAddr, cuGeom.absPartIdx);
54
 }
55
 
56
+#if ENABLE_SCC_EXT
57
+uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, IBC* ibc)
58
+#else
59
 uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
60
+#endif
61
 {
62
     uint32_t depth = cuGeom.depth;
63
     ModeDepth& md = m_modeDepth[depth];
64
     md.bestMode = NULL;
65
 
66
+    MV iMVCandList[4][10];
67
+    memset(iMVCandList, 0, sizeof(MV) * 4 * 10);
68
+
69
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
70
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
71
 
72
@@ -567,6 +587,43 @@
73
             checkBestMode(md.pred[PRED_INTRA_NxN], depth);
74
         }
75
 
76
+#if ENABLE_SCC_EXT
77
+        bool intraBlockCopyFastSearch = (m_param->bEnableSCC == 1) ? true : false, bUse1DSearchFor8x8 = false;
78
+        if (m_param->bEnableSCC)
79
+        {
80
+            md.pred[PRED_MERGE_IBC].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
81
+            checkRDCostIntraBCMerge2Nx2N(md.pred[PRED_MERGE_IBC], cuGeom);
82
+
83
+            md.pred[PRED_IBC_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
84
+            checkIntraBC_rd5_6(md.pred[PRED_IBC_2Nx2N], cuGeom, SIZE_2Nx2N, false, bUse1DSearchFor8x8, *ibc);
85
+            checkBestMode(md.pred[PRED_IBC_2Nx2N], depth);
86
+
87
+            if (intraBlockCopyFastSearch)
88
+            {
89
+                if ((int)depth == m_slice->m_sps->log2DiffMaxMinCodingBlockSize)
90
+                {
91
+                    md.pred[PRED_IBC_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
92
+                    checkIntraBC_rd5_6(md.pred[PRED_IBC_Nx2N], cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_Nx2N] + 8));
93
+                    checkBestMode(md.pred[PRED_IBC_Nx2N], depth);
94
+
95
+                    md.pred[PRED_IBC_2NxN].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
96
+                    checkIntraBC_rd5_6(md.pred[PRED_IBC_2NxN], cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_2NxN] + 8));
97
+                    checkBestMode(md.pred[PRED_IBC_2NxN], depth);
98
+                }
99
+            }
100
+            else
101
+            {
102
+                md.pred[PRED_IBC_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
103
+                checkIntraBC_rd5_6(md.pred[PRED_IBC_2NxN], cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_2NxN] + 8));
104
+                checkBestMode(md.pred[PRED_IBC_2NxN], depth);
105
+
106
+                md.pred[PRED_IBC_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
107
+                checkIntraBC_rd5_6(md.pred[PRED_IBC_Nx2N], cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_Nx2N] + 8));
108
+                checkBestMode(md.pred[PRED_IBC_Nx2N], depth);
109
+            }
110
+        }
111
+#endif
112
+
113
         if (m_bTryLossless)
114
             tryLossless(cuGeom);
115
 
116
@@ -574,6 +631,91 @@
117
             addSplitFlagCost(*md.bestMode, cuGeom.depth);
118
     }
119
 
120
+#if ENABLE_SCC_EXT
121
+    // If Intra BC keep last coded Mv
122
+    if (md.bestMode && md.bestMode->cu.isInter(0))
123
+    {
124
+        MVField mvField;
125
+        const CUData* cu = &md.bestMode->cu;
126
+        md.bestMode->cu.getMvField(cu, 0, 0, mvField);
127
+        int iRefIdxFirst = mvField.refIdx;
128
+        md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
129
+        int iRefIdxLast = mvField.refIdx;
130
+        bool isIntraBCFirst = (iRefIdxFirst >= 0) ? cu->m_slice->m_refFrameList[0][iRefIdxFirst]->m_poc == cu->m_slice->m_poc : false;
131
+        bool isIntraBCLast = (iRefIdxLast >= 0) ? cu->m_slice->m_refFrameList[0][iRefIdxLast]->m_poc == cu->m_slice->m_poc : false;
132
+
133
+        if (isIntraBCFirst || isIntraBCLast)
134
+        {
135
+            if (cu->m_partSize[0] == SIZE_2Nx2N)
136
+            {
137
+                md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
138
+                if (mvField.mv != cu->m_lastIntraBCMv[0])
139
+                {
140
+                    md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
141
+                    md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
142
+                }
143
+            }
144
+            else if (cu->m_partSize[0] == SIZE_2NxN || cu->m_partSize[0] == SIZE_Nx2N)
145
+            {
146
+                // mixed PU, only one partition is IntraBC coded
147
+                if (isIntraBCFirst != isIntraBCLast)
148
+                {
149
+                    if (isIntraBCFirst)
150
+                    {
151
+                        // Part 0
152
+                        md.bestMode->cu.getMvField(cu, 0, 0, mvField);
153
+                        if (mvField.mv != cu->m_lastIntraBCMv[0])
154
+                        {
155
+                            md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
156
+                            md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
157
+                        }
158
+                    }
159
+                    else if (isIntraBCLast)
160
+                    {
161
+                        // Part 1
162
+                        md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
163
+                        if (mvField.mv != cu->m_lastIntraBCMv[0])
164
+                        {
165
+                            md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
166
+                            md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
167
+                        }
168
+                    }
169
+                }
170
+                else // normal IntraBC CU
171
+                {
172
+                    // Part 0
173
+                    md.bestMode->cu.getMvField(cu, 0, 0, mvField);
174
+                    if (mvField.mv != cu->m_lastIntraBCMv[0])
175
+                    {
176
+                        md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
177
+                        md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
178
+                    }
179
+                    // Part 1
180
+                    md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
181
+                    if (mvField.mv != cu->m_lastIntraBCMv[0])
182
+                    {
183
+                        md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
184
+                        md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
185
+                    }
186
+                }
187
+            }
188
+            else
189
+            {
190
+                // NxN
191
+                for (int part = 0; part < 4; part++)
192
+                {
193
+                    md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 4 + part, 0, mvField);
194
+                    if (mvField.mv != cu->m_lastIntraBCMv[0])
195
+                    {
196
+                        md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
197
+                        md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
198
+                    }
199
+                }
200
+            }
201
x265_3.6.tar.gz/source/encoder/analysis.h -> x265_4.0.tar.gz/source/encoder/analysis.h Changed
70
 
1
@@ -75,6 +75,14 @@
2
         PRED_nRx2N,
3
         PRED_INTRA_NxN, /* 4x4 intra PU blocks for 8x8 CU */
4
         PRED_LOSSLESS,  /* lossless encode of best mode */
5
+#if ENABLE_SCC_EXT
6
+        PRED_IBC_2Nx2N,
7
+        PRED_IBC_Nx2N,
8
+        PRED_IBC_2NxN,
9
+        PRED_MIXED_IBC_NX2N,
10
+        PRED_MIXED_IBC_2NXN,
11
+        PRED_MERGE_IBC,
12
+#endif
13
         MAX_PRED_TYPES
14
     };
15
 
16
@@ -113,6 +121,7 @@
17
     bool      m_modeFlag[2];
18
     bool      m_checkMergeAndSkipOnly[2];
19
 
20
+    IBC       m_ibc;
21
     Analysis();
22
 
23
     bool create(ThreadLocalData* tld);
24
@@ -120,6 +129,7 @@
25
 
26
     Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
27
     int32_t loadTUDepth(CUGeom cuGeom, CUData parentCTU);
28
+
29
 protected:
30
     /* Analysis data for save/load mode, writes/reads data based on absPartIdx */
31
     x265_analysis_inter_data*  m_reuseInterDataCTU;
32
@@ -162,12 +172,20 @@
33
     void qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp);
34
 
35
     /* full analysis for an I-slice CU */
36
+#if ENABLE_SCC_EXT
37
+    uint64_t compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, IBC* ibc = NULL);
38
+#else
39
     uint64_t compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
40
+#endif
41
 
42
     /* full analysis for a P or B slice CU */
43
     uint32_t compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
44
     SplitData compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
45
+#if ENABLE_SCC_EXT
46
+    SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, IBC* ibc = NULL);
47
+#else
48
     SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
49
+#endif
50
 
51
     void recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t origqp = -1);
52
 
53
@@ -177,10 +195,15 @@
54
 
55
     /* measure inter options */
56
     void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2]);
57
-    void checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2]);
58
+    void checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2], MV* iMVCandList = NULL);
59
 
60
     void checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom);
61
 
62
+#if ENABLE_SCC_EXT
63
+    void checkRDCostIntraBCMerge2Nx2N(Mode& merge, const CUGeom& cuGeom);
64
+    void checkIntraBC_rd5_6(Mode& intraBCMode, const CUGeom& cuGeom, PartSize ePartSize, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc, MV* iMVCandList = NULL);
65
+#endif
66
+
67
     /* encode current bestMode losslessly, pick best RD cost */
68
     void tryLossless(const CUGeom& cuGeom);
69
 
70
x265_3.6.tar.gz/source/encoder/api.cpp -> x265_4.0.tar.gz/source/encoder/api.cpp Changed
201
 
1
@@ -20,7 +20,6 @@
2
  * This program is also available under a commercial proprietary license.
3
  * For more information, contact us at license @ x265.com.
4
  *****************************************************************************/
5
-
6
 #include "common.h"
7
 #include "bitstream.h"
8
 #include "param.h"
9
@@ -185,7 +184,7 @@
10
     // will detect and set profile/tier/level in VPS
11
     determineLevel(*param, encoder->m_vps);
12
 
13
-    if (!param->bAllowNonConformance && encoder->m_vps.ptl.profileIdc == Profile::NONE)
14
+    if (!param->bAllowNonConformance && encoder->m_vps.ptl.profileIdc[0] == Profile::NONE)
15
     {
16
         x265_log(param, X265_LOG_INFO, "non-conformant bitstreams not allowed (--allow-non-conformance)\n");
17
         goto fail;
18
@@ -357,11 +356,11 @@
19
             VPS saveVPS;
20
             memcpy(&saveVPS.ptl, &encoder->m_vps.ptl, sizeof(saveVPS.ptl));
21
             determineLevel(*encoder->m_latestParam, encoder->m_vps);
22
-            if (saveVPS.ptl.profileIdc != encoder->m_vps.ptl.profileIdc || saveVPS.ptl.levelIdc != encoder->m_vps.ptl.levelIdc
23
+            if (saveVPS.ptl.profileIdc[0] != encoder->m_vps.ptl.profileIdc[0] || saveVPS.ptl.levelIdc != encoder->m_vps.ptl.levelIdc
24
                 || saveVPS.ptl.tierFlag != encoder->m_vps.ptl.tierFlag)
25
             {
26
                 x265_log(encoder->m_param, X265_LOG_WARNING, "Profile/Level/Tier has changed from %d/%d/%s to %d/%d/%s.Cannot reconfigure rate-control.\n",
27
-                         saveVPS.ptl.profileIdc, saveVPS.ptl.levelIdc, saveVPS.ptl.tierFlag ? "High" : "Main", encoder->m_vps.ptl.profileIdc,
28
+                         saveVPS.ptl.profileIdc[0], saveVPS.ptl.levelIdc, saveVPS.ptl.tierFlag ? "High" : "Main", encoder->m_vps.ptl.profileIdc[0],
29
                          encoder->m_vps.ptl.levelIdc, encoder->m_vps.ptl.tierFlag ? "High" : "Main");
30
                 x265_copy_params(encoder->m_latestParam, &save);
31
                 memcpy(&encoder->m_vps.ptl, &saveVPS.ptl, sizeof(saveVPS.ptl));
32
@@ -406,7 +405,7 @@
33
     return 0;
34
 }
35
 
36
-int x265_encoder_encode(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture *pic_out)
37
+int x265_encoder_encode(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture **pic_out)
38
 {
39
     if (!enc)
40
         return -1;
41
@@ -602,7 +601,10 @@
42
         *pi_nal = 0;
43
 
44
     if (numEncoded && encoder->m_param->csvLogLevel && encoder->m_outputCount >= encoder->m_latestParam->chunkStart)
45
-        x265_csvlog_frame(encoder->m_param, pic_out);
46
+    {
47
+        for (int layer = 0; layer < encoder->m_param->numLayers; layer++)
48
+            x265_csvlog_frame(encoder->m_param, pic_out[layer]);
49
+    }
50
 
51
     if (numEncoded < 0)
52
         encoder->m_aborted = true;
53
@@ -653,11 +655,14 @@
54
     if (enc)
55
     {
56
         Encoder *encoder = static_cast<Encoder*>(enc);
57
-        x265_stats stats;       
58
-        encoder->fetchStats(&stats, sizeof(stats));
59
+        x265_stats stats[MAX_LAYERS];
60
         int padx = encoder->m_sps.conformanceWindow.rightOffset;
61
         int pady = encoder->m_sps.conformanceWindow.bottomOffset;
62
-        x265_csvlog_encode(encoder->m_param, &stats, padx, pady, argc, argv);
63
+        for (int layer = 0; layer < encoder->m_param->numLayers; layer++)
64
+        {
65
+            encoder->fetchStats(stats, sizeof(stats[layer]), layer);
66
+            x265_csvlog_encode(encoder->m_param, &stats[0], padx, pady, argc, argv);
67
+        }
68
     }
69
 }
70
 
71
@@ -744,7 +749,7 @@
72
     if (!enc)
73
         return -1;
74
     Encoder *encoder = static_cast<Encoder*>(enc);
75
-    if (!encoder->copySlicetypePocAndSceneCut(slicetype, poc, sceneCut))
76
+    if (!encoder->copySlicetypePocAndSceneCut(slicetype, poc, sceneCut, 0))
77
         return 0;
78
     return -1;
79
 }
80
@@ -1295,7 +1300,7 @@
81
         {
82
             if (param->csvLogLevel)
83
             {
84
-                fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
85
+                fprintf(csvfp, "Layer , Encode Order, Type, POC, QP, Bits, Scenecut, ");
86
                 if (!!param->bEnableTemporalSubLayers)
87
                     fprintf(csvfp, "Temporal Sub Layer ID, ");
88
                 if (param->csvLogLevel >= 2)
89
@@ -1409,7 +1414,7 @@
90
         return;
91
 
92
     const x265_frame_stats* frameStats = &pic->frameData;
93
-    fprintf(param->csvfpt, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc,
94
+    fprintf(param->csvfpt, "%d, %d, %c-SLICE, %4d, %2.2lf, %10d, %d,", pic->layerID, frameStats->encoderOrder, frameStats->sliceType, frameStats->poc,
95
                                                                    frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
96
     if (!!param->bEnableTemporalSubLayers)
97
         fprintf(param->csvfpt, "%d,", frameStats->tLayer);
98
@@ -1806,6 +1811,219 @@
99
     return ret;
100
 }
101
 
102
+static enum VmafOutputFormat log_fmt_map(const char *log_fmt)
103
+{
104
+   if (log_fmt) {
105
+       if (!strcmp(log_fmt, "xml"))
106
+           return VMAF_OUTPUT_FORMAT_XML;
107
+       if (!strcmp(log_fmt, "json"))
108
+           return VMAF_OUTPUT_FORMAT_JSON;
109
+       if (!strcmp(log_fmt, "csv"))
110
+           return VMAF_OUTPUT_FORMAT_CSV;
111
+       if (!strcmp(log_fmt, "sub"))
112
+           return VMAF_OUTPUT_FORMAT_SUB;
113
+   }
114
+
115
+   return VMAF_OUTPUT_FORMAT_NONE;
116
+}
117
+
118
+static enum VmafPoolingMethod pool_method_map(const char *pool_method)
119
+{
120
+   if (pool_method) {
121
+       if (!strcmp(pool_method, "min"))
122
+           return VMAF_POOL_METHOD_MIN;
123
+       if (!strcmp(pool_method, "mean"))
124
+           return VMAF_POOL_METHOD_MEAN;
125
+       if (!strcmp(pool_method, "harmonic_mean"))
126
+           return VMAF_POOL_METHOD_HARMONIC_MEAN;
127
+   }
128
+   return VMAF_POOL_METHOD_MEAN;
129
+}
130
+
131
+static enum VmafPixelFormat pix_fmt_map(const char *fmt)
132
+{
133
+   if (fmt) {
134
+       if (!strcmp(fmt, "yuv420p") || !strcmp(fmt, "yuv420p10le") || !strcmp(fmt, "yuv420p12le") || !strcmp(fmt, "yuv420p16le"))
135
+            return VMAF_PIX_FMT_YUV420P;
136
+        if (!strcmp(fmt, "yuv422p") || !strcmp(fmt, "yuv422p10le"))
137
+            return VMAF_PIX_FMT_YUV422P;
138
+        if (!strcmp(fmt, "yuv444p") || !strcmp(fmt, "yuv444p10le"))
139
+            return VMAF_PIX_FMT_YUV444P;
140
+   }
141
+   return VMAF_PIX_FMT_UNKNOWN;
142
+}
143
+
144
+static void copy_picture(float *src, VmafPicture *dst, unsigned width, unsigned height, int src_stride, unsigned bpc)
145
+{
146
+    const int bytes_per_value = bpc > 8 ? 2 : 1;
147
+    const int dst_stride = dst->stride[0] / bytes_per_value;
148
+    const unsigned b_shift = (bpc > 8) ? (bpc - 8) : 0;
149
+
150
+    uint8_t *dst_data = static_cast<uint8_t*>(dst->data[0]);
151
+
152
+    for (unsigned i = 0; i < height; i++) {
153
+        if (bpc > 8) {
154
+            uint16_t *dst_row = reinterpret_cast<uint16_t*>(dst_data);
155
+            for (unsigned j = 0; j < width; j++) {
156
+                dst_row[j] = static_cast<uint16_t>(src[j] * (1 << b_shift));
157
+            }
158
+        } else {
159
+            for (unsigned j = 0; j < width; j++) {
160
+                dst_data[j] = static_cast<uint8_t>(src[j]);
161
+            }
162
+        }
163
+        src += src_stride / sizeof(float);
164
+        dst_data += dst_stride * bytes_per_value;
165
+    }
166
+}
167
+
168
+int load_feature(VmafContext *vmaf, const char *feature_name, VmafFeatureDictionary *d) {
169
+    int err = vmaf_use_feature(vmaf, feature_name, d);
170
+    if (err) {
171
+        printf("problem loading feature extractor: %s\n", feature_name);
172
+    }
173
+    return err;
174
+}
175
+
176
+int compute_vmaf(double* vmaf_score, char* fmt, int width, int height, int bitdepth, int(*read_frame)(float *ref_data, float *main_data, float *temp_data, int stride_byte, void *user_data),
177
+   void *user_data, char *model_path, char *log_path, char *log_fmt, int disable_clip, int disable_avx, int enable_transform, int phone_model, int do_psnr, int do_ssim, int do_ms_ssim,
178
+   char *pool_method, int n_thread, int n_subsample)
179
+{
180
+   int err = 0;
181
+
182
+   VmafConfiguration cfg = {
183
+       .log_level = VMAF_LOG_LEVEL_INFO,
184
+       .n_threads = n_thread,
185
+       .n_subsample = n_subsample,
186
+       .cpumask = disable_avx ? -1 : 0,
187
+       .gpumask = 0,
188
+   };
189
+
190
+   VmafContext *vmaf;
191
+   err = vmaf_init(&vmaf, cfg);
192
+   if (err) {
193
+       printf("problem initializing VMAF context\n");
194
+       return -1;
195
+   }
196
+
197
+   uint64_t flags = VMAF_MODEL_FLAGS_DEFAULT;
198
+   if (disable_clip)
199
+       flags |= VMAF_MODEL_FLAG_DISABLE_CLIP;
200
+   if (enable_transform || phone_model)
201
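
The api.cpp hunk above changes the public encode call: x265_encoder_encode() now takes the output picture as x265_picture** (one slot per output layer), and per-layer statistics are fetched and logged in a loop. A hedged sketch of how a single-layer 3.6-style caller might adapt; the helper function and its names below are illustrative and not part of the x265 sources:

    /* Illustrative adaptation of a 3.6-style caller to the 4.0 signature. */
    #include <x265.h>

    int encode_one(x265_param* param, x265_encoder* enc, x265_picture* in,
                   x265_nal** nal, uint32_t* nalCount)
    {
        x265_picture recon;                       /* base-layer reconstructed picture */
        x265_picture_init(param, &recon);
        x265_picture* reconList[1] = { &recon };  /* 4.0 expects an array of pointers */

        /* 3.6 passed &recon directly; 4.0 takes x265_picture**. */
        return x265_encoder_encode(enc, nal, nalCount, in, reconList);
    }
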
x265_3.6.tar.gz/source/encoder/dpb.cpp -> x265_4.0.tar.gz/source/encoder/dpb.cpp Changed
201
 
1
@@ -53,8 +53,8 @@
2
         FrameData* next = m_frameDataFreeList->m_freeListNext;
3
         m_frameDataFreeList->destroy();
4
 
5
-        m_frameDataFreeList->m_reconPic->destroy();
6
-        delete m_frameDataFreeList->m_reconPic;
7
+        m_frameDataFreeList->m_reconPic[0]->destroy();
8
+        delete m_frameDataFreeList->m_reconPic[0];
9
 
10
         delete m_frameDataFreeList;
11
         m_frameDataFreeList = next;
12
@@ -75,7 +75,7 @@
13
         if (curFrame->m_param->bEnableTemporalFilter)
14
             isMCSTFReferenced =!!(curFrame->m_refPicCnt[1]);
15
 
16
-        if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders && !isMCSTFReferenced)
17
+        if (curFrame->m_valid && !curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders && !isMCSTFReferenced)
18
         {
19
             curFrame->m_bChromaExtended = false;
20
 
21
@@ -95,6 +95,12 @@
22
 
23
             // iterator is invalidated by remove, restart scan
24
             m_picList.remove(*curFrame);
25
+#if ENABLE_MULTIVIEW
26
+            if (curFrame->m_param->numViews > 1 && !curFrame->m_viewId && m_picList.getPOC(curFrame->m_poc, 1) && curFrame == m_picList.getPOC(curFrame->m_poc, 1)->refPicSetInterLayer0.getPOC(curFrame->m_poc, curFrame->m_viewId))
27
+            {
28
+                m_picList.getPOC(curFrame->m_poc, 1)->refPicSetInterLayer0.removeSubDPB(*curFrame);
29
+            }
30
+#endif
31
             iterFrame = m_picList.first();
32
 
33
             m_freeList.pushBack(*curFrame);
34
@@ -126,7 +132,8 @@
35
                 curFrame->m_prevCtuInfoChange = NULL;
36
             }
37
             curFrame->m_encData = NULL;
38
-            curFrame->m_reconPic = NULL;
39
+            for (int i = 0; i < !!curFrame->m_param->bEnableSCC + 1; i++)
40
+                curFrame->m_reconPic[i] = NULL;
41
         }
42
     }
43
 }
44
@@ -145,6 +152,11 @@
45
         m_lastIDR = pocCurr;
46
     slice->m_lastIDR = m_lastIDR;
47
     slice->m_sliceType = IS_X265_TYPE_B(type) ? B_SLICE : (type == X265_TYPE_P) ? P_SLICE : I_SLICE;
48
+#if ENABLE_SCC_EXT
49
+    if (slice->m_param->bEnableSCC)        slice->m_origSliceType = slice->m_sliceType;
50
+    if (slice->m_param->bEnableSCC && IS_X265_TYPE_I(type))
51
+        slice->m_sliceType = P_SLICE;
52
+#endif
53
 
54
     if (type == X265_TYPE_B)
55
     {
56
@@ -177,7 +189,8 @@
57
 
58
     m_picList.pushFront(*newFrame);
59
 
60
-    if (m_bTemporalSublayer && getTemporalLayerNonReferenceFlag())
61
+    int layer = slice->m_param->numViews > 1 ? newFrame->m_viewId : (slice->m_param->numScalableLayers > 1) ? newFrame->m_sLayerId : 0;
62
+    if (m_bTemporalSublayer && getTemporalLayerNonReferenceFlag(layer))
63
     {
64
         switch (slice->m_nalUnitType)
65
         {
66
@@ -195,12 +208,13 @@
67
         }
68
     }
69
     // Do decoding refresh marking if any
70
-    decodingRefreshMarking(pocCurr, slice->m_nalUnitType);
71
+    decodingRefreshMarking(pocCurr, slice->m_nalUnitType, layer);
72
 
73
-    computeRPS(pocCurr, newFrame->m_tempLayer, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBuffering[newFrame->m_tempLayer]);
74
+    uint32_t maxDecBuffer = (slice->m_sps->maxDecPicBuffering[newFrame->m_tempLayer] >= 8 && slice->m_param->bEnableSCC) ? 7 : slice->m_sps->maxDecPicBuffering[newFrame->m_tempLayer];
75
+    computeRPS(pocCurr, newFrame->m_tempLayer, slice->isIRAP(), &slice->m_rps, maxDecBuffer, layer);
76
     bool isTSAPic = ((slice->m_nalUnitType == 2) || (slice->m_nalUnitType == 3)) ? true : false;
77
     // Mark pictures in m_piclist as unreferenced if they are not included in RPS
78
-    applyReferencePictureSet(&slice->m_rps, pocCurr, newFrame->m_tempLayer, isTSAPic);
79
+    applyReferencePictureSet(&slice->m_rps, pocCurr, newFrame->m_tempLayer, isTSAPic, layer);
80
 
81
 
82
     if (m_bTemporalSublayer && newFrame->m_tempLayer > 0
83
@@ -210,9 +224,9 @@
84
             || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RASL_R)
85
         )
86
     {
87
-        if (isTemporalLayerSwitchingPoint(pocCurr, newFrame->m_tempLayer) || (slice->m_sps->maxTempSubLayers == 1))
88
+        if (isTemporalLayerSwitchingPoint(pocCurr, newFrame->m_tempLayer, layer) || (slice->m_sps->maxTempSubLayers == 1))
89
         {
90
-            if (getTemporalLayerNonReferenceFlag())
91
+            if (getTemporalLayerNonReferenceFlag(layer))
92
             {
93
                 slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_N;
94
             }
95
@@ -221,7 +235,7 @@
96
                 slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_R;
97
             }
98
         }
99
-        else if (isStepwiseTemporalLayerSwitchingPoint(&slice->m_rps, pocCurr, newFrame->m_tempLayer))
100
+        else if (isStepwiseTemporalLayerSwitchingPoint(&slice->m_rps, pocCurr, newFrame->m_tempLayer, layer))
101
         {
102
             bool isSTSA = true;
103
             int id = newFrame->m_gopOffset % x265_gop_ra_length[newFrame->m_gopId];
104
@@ -254,7 +268,7 @@
105
             }
106
             if (isSTSA == true)
107
             {
108
-                if (getTemporalLayerNonReferenceFlag())
109
+                if (getTemporalLayerNonReferenceFlag(layer))
110
                 {
111
                     slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_STSA_N;
112
                 }
113
@@ -266,12 +280,22 @@
114
         }
115
     }
116
 
117
+#if ENABLE_MULTIVIEW
118
+    if (newFrame->m_viewId)
119
+        slice->createInterLayerReferencePictureSet(m_picList, newFrame->refPicSetInterLayer0, newFrame->refPicSetInterLayer1);
120
+#endif
121
+    int numRef = slice->m_param->bEnableSCC ? slice->m_rps.numberOfNegativePictures + 1 : slice->m_rps.numberOfNegativePictures;
122
     if (slice->m_sliceType != I_SLICE)
123
-        slice->m_numRefIdx[0] = x265_clip3(1, newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures);
124
+        slice->m_numRefIdx[0] = x265_clip3(1, newFrame->m_param->maxNumReferences, numRef + newFrame->refPicSetInterLayer0.size() + newFrame->refPicSetInterLayer1.size());
125
+    else
126
+        slice->m_numRefIdx[0] = X265_MIN(newFrame->m_param->maxNumReferences, numRef); // Ensuring L0 contains just the -ve POC
127
+#if ENABLE_MULTIVIEW || ENABLE_SCC_EXT
128
+    if(slice->m_param->numViews > 1 || !!slice->m_param->bEnableSCC)
129
+        slice->m_numRefIdx[1] = X265_MIN(newFrame->m_param->bBPyramid ? 3 : 2, slice->m_rps.numberOfPositivePictures + newFrame->refPicSetInterLayer0.size() + newFrame->refPicSetInterLayer1.size());
130
     else
131
-        slice->m_numRefIdx[0] = X265_MIN(newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures); // Ensuring L0 contains just the -ve POC
132
-    slice->m_numRefIdx[1] = X265_MIN(newFrame->m_param->bBPyramid ? 2 : 1, slice->m_rps.numberOfPositivePictures);
133
-    slice->setRefPicList(m_picList);
134
+#endif
135
+        slice->m_numRefIdx[1] = X265_MIN(newFrame->m_param->bBPyramid ? 2 : 1, slice->m_rps.numberOfPositivePictures);
136
+    slice->setRefPicList(m_picList, newFrame->refPicSetInterLayer0, newFrame->refPicSetInterLayer1, layer);
137
 
138
     X265_CHECK(slice->m_sliceType != B_SLICE || slice->m_numRefIdx[1], "B slice without L1 references (non-fatal)\n");
139
 
140
@@ -280,9 +304,29 @@
141
         /* TODO: the lookahead should be able to tell which reference picture
142
          * had the least motion residual.  We should be able to use that here to
143
          * select a colocation reference list and index */
144
-        slice->m_colFromL0Flag = false;
145
+
146
+        bool bLowDelay = true;
147
+        int  iCurrPOC = slice->m_poc;
148
+        int iRefIdx = 0;
149
+
150
+        for (iRefIdx = 0; iRefIdx < slice->m_numRefIdx[0] && bLowDelay; iRefIdx++)
151
+        {
152
+            if (slice->m_refPOCList[0][iRefIdx] > iCurrPOC)
153
+            {
154
+                bLowDelay = false;
155
+            }
156
+        }
157
+        for (iRefIdx = 0; iRefIdx < slice->m_numRefIdx[1] && bLowDelay; iRefIdx++)
158
+        {
159
+            if (slice->m_refPOCList[1][iRefIdx] > iCurrPOC)
160
+            {
161
+                bLowDelay = false;
162
+            }
163
+        }
164
+
165
+        slice->m_bCheckLDC = bLowDelay;
166
+        slice->m_colFromL0Flag = bLowDelay;
167
         slice->m_colRefIdx = 0;
168
-        slice->m_bCheckLDC = false;
169
     }
170
     else
171
     {
172
@@ -291,6 +335,59 @@
173
         slice->m_colRefIdx = 0;
174
     }
175
 
176
+    slice->m_bTemporalMvp = slice->m_sps->bTemporalMVPEnabled;
177
+#if ENABLE_SCC_EXT
178
+    bool bGPBcheck = false;
179
+    if (slice->m_sliceType == B_SLICE)
180
+    {
181
+        if (slice->m_param->bEnableSCC)
182
+        {
183
+            if (slice->m_numRefIdx[0] - 1 == slice->m_numRefIdx[1])
184
+            {
185
+                bGPBcheck = true;
186
+                for (int i = 0; i < slice->m_numRefIdx[1]; i++)
187
+                {
188
+                    if (slice->m_refPOCList[1][i] != slice->m_refPOCList[0][i])
189
+                    {
190
+                        bGPBcheck = false;
191
+                        break;
192
+                    }
193
+                }
194
+            }
195
+        }
196
+        else if (slice->m_numRefIdx[0] == slice->m_numRefIdx[1])
197
+        {
198
+            bGPBcheck = true;
199
+            int i;
200
+            for (i = 0; i < slice->m_numRefIdx[1]; i++)
201
x265_3.6.tar.gz/source/encoder/dpb.h -> x265_4.0.tar.gz/source/encoder/dpb.h Changed
21
 
1
@@ -79,13 +79,13 @@
2
 
3
 protected:
4
 
5
-    void computeRPS(int curPoc,int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
6
+    void computeRPS(int curPoc,int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer, int sLayerId);
7
 
8
-    void applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture);
9
-    bool getTemporalLayerNonReferenceFlag();
10
-    void decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType);
11
-    bool isTemporalLayerSwitchingPoint(int curPoc, int tempId);
12
-    bool isStepwiseTemporalLayerSwitchingPoint(RPS *rps, int curPoc, int tempId);
13
+    void applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture, int sLayerId);
14
+    bool getTemporalLayerNonReferenceFlag(int sLayerId);
15
+    void decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType, int sLayerId);
16
+    bool isTemporalLayerSwitchingPoint(int curPoc, int tempId, int sLayerId);
17
+    bool isStepwiseTemporalLayerSwitchingPoint(RPS *rps, int curPoc, int tempId, int sLayerId);
18
 
19
     NalUnitType getNalUnitType(int curPoc, bool bIsKeyFrame);
20
 };
21
x265_3.6.tar.gz/source/encoder/encoder.cpp -> x265_4.0.tar.gz/source/encoder/encoder.cpp Changed
201
 
1
@@ -134,7 +134,6 @@
2
     m_lookahead = NULL;
3
     m_rateControl = NULL;
4
     m_dpb = NULL;
5
-    m_exportedPic = NULL;
6
     m_numDelayedPic = 0;
7
     m_outputCount = 0;
8
     m_param = NULL;
9
@@ -150,6 +149,8 @@
10
     m_rpsInSpsCount = 0;
11
     m_cB = 1.0;
12
     m_cR = 1.0;
13
+    for (int i = 0; i < MAX_LAYERS; i++)
14
+        m_exportedPic[i] = NULL;
15
     for (int i = 0; i < X265_MAX_FRAME_THREADS; i++)
16
         m_frameEncoder[i] = NULL;
17
     for (uint32_t i = 0; i < DUP_BUFFER; i++)
18
@@ -597,9 +598,9 @@
19
     }
20
 }
21
 
22
-int Encoder::copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut)
23
+int Encoder::copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut, int sLayer)
24
 {
25
-    Frame *FramePtr = m_dpb->m_picList.getCurFrame();
26
+    Frame *FramePtr = m_dpb->m_picList.getCurFrame(sLayer);
27
     if (FramePtr != NULL)
28
     {
29
         *slicetype = FramePtr->m_lowres.sliceType;
30
@@ -618,31 +619,36 @@
31
 {
32
     if (!(IS_X265_TYPE_I(sliceType)))
33
     {
34
-        Frame *framePtr = m_dpb->m_picList.getPOC(poc);
35
+        Frame *framePtr = m_dpb->m_picList.getPOC(poc, 0);
36
         if (framePtr != NULL)
37
         {
38
             for (int j = 0; j < framePtr->m_encData->m_slice->m_numRefIdx[0]; j++)    // check only for --ref=n number of frames.
39
             {
40
-                if (framePtr->m_encData->m_slice->m_refFrameList[0][j] && framePtr->m_encData->m_slice->m_refFrameList[0][j]->m_reconPic != NULL)
41
+                if (framePtr->m_encData->m_slice->m_refFrameList[0][j] && framePtr->m_encData->m_slice->m_refFrameList[0][j]->m_reconPic[0] != NULL)
42
                 {
43
                     int l0POC = framePtr->m_encData->m_slice->m_refFrameList[0][j]->m_poc;
44
                     pocL0[j] = l0POC;
45
-                    Frame* l0Fp = m_dpb->m_picList.getPOC(l0POC);
46
-                    while (l0Fp->m_reconRowFlag[l0Fp->m_numRows - 1].get() == 0)
47
-                        l0Fp->m_reconRowFlag[l0Fp->m_numRows - 1].waitForChange(0); /* If recon is not ready, current frame encoder has to wait. */
48
-                    l0[j] = l0Fp->m_reconPic;
49
+                    Frame* l0Fp = m_dpb->m_picList.getPOC(l0POC, 0);
50
+#if ENABLE_SCC_EXT
51
+                    if (l0POC != poc)
52
+#endif
53
+                    {
54
+                        while (l0Fp->m_reconRowFlag[l0Fp->m_numRows - 1].get() == 0)
55
+                            l0Fp->m_reconRowFlag[l0Fp->m_numRows - 1].waitForChange(0); /* If recon is not ready, current frame encoder has to wait. */
56
+                    }
57
+                    l0[j] = l0Fp->m_reconPic[0];
58
                 }
59
             }
60
             for (int j = 0; j < framePtr->m_encData->m_slice->m_numRefIdx[1]; j++)    // check only for --ref=n number of frames.
61
             {
62
-                if (framePtr->m_encData->m_slice->m_refFrameList[1][j] && framePtr->m_encData->m_slice->m_refFrameList[1][j]->m_reconPic != NULL)
63
+                if (framePtr->m_encData->m_slice->m_refFrameList[1][j] && framePtr->m_encData->m_slice->m_refFrameList[1][j]->m_reconPic[0] != NULL)
64
                 {
65
                     int l1POC = framePtr->m_encData->m_slice->m_refFrameList[1][j]->m_poc;
66
                     pocL1[j] = l1POC;
67
-                    Frame* l1Fp = m_dpb->m_picList.getPOC(l1POC);
68
+                    Frame* l1Fp = m_dpb->m_picList.getPOC(l1POC, 0);
69
                     while (l1Fp->m_reconRowFlag[l1Fp->m_numRows - 1].get() == 0)
70
                         l1Fp->m_reconRowFlag[l1Fp->m_numRows - 1].waitForChange(0); /* If recon is not ready, current frame encoder has to wait. */
71
-                    l1[j] = l1Fp->m_reconPic;
72
+                    l1[j] = l1Fp->m_reconPic[0];
73
                 }
74
             }
75
         }
76
@@ -762,7 +768,7 @@
77
     uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
78
     uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
79
 
80
-    Frame* curFrame = m_dpb->m_picList.getPOC(poc);
81
+    Frame* curFrame = m_dpb->m_picList.getPOC(poc, 0);
82
     if (curFrame != NULL)
83
     {
84
         curFrame->m_analysisData = (*analysis_data);
85
@@ -861,10 +867,13 @@
86
         X265_FREE(m_rdCost);
87
         X265_FREE(m_trainingCount);
88
     }
89
-    if (m_exportedPic)
90
+    for (int layer = 0; layer < m_param->numLayers; layer++)
91
     {
92
-        ATOMIC_DEC(&m_exportedPic->m_countRefEncoders);
93
-        m_exportedPic = NULL;
94
+        if (m_exportedPic[layer])
95
+        {
96
+            ATOMIC_DEC(&m_exportedPic[layer]->m_countRefEncoders);
97
+            m_exportedPic[layer] = NULL;
98
+        }
99
     }
100
 
101
     if (m_param->bEnableFrameDuplication)
102
@@ -1359,6 +1368,10 @@
103
     memcpy(dest->planes0, src->planes0, src->framesize * sizeof(char));
104
     dest->planes1 = (char*)dest->planes0 + src->stride0 * src->height;
105
     dest->planes2 = (char*)dest->planes1 + src->stride1 * (src->height >> x265_cli_cspssrc->colorSpace.height1);
106
+#if ENABLE_ALPHA
107
+    if(m_param->bEnableAlpha)
108
+        dest->planes3 = (char*)dest->planes2 + src->stride2 * (src->height >> x265_cli_cspssrc->colorSpace.height2);
109
+#endif
110
 }
111
 
112
 bool Encoder::isFilterThisframe(uint8_t sliceTypeConfig, int curSliceType)
113
@@ -1458,7 +1471,7 @@
114
  * returns 0 if no frames are currently available for output
115
  *         1 if frame was output, m_nalList contains access unit
116
  *         negative on malloc error or abort */
117
-int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)
118
+int Encoder::encode(const x265_picture* pic_in, x265_picture** pic_out)
119
 {
120
 #if CHECKED_BUILD || _DEBUG
121
     if (g_checkFailures)
122
@@ -1470,19 +1483,21 @@
123
     if (m_aborted)
124
         return -1;
125
 
126
-    const x265_picture* inputPic = NULL;
127
+    const x265_picture* inputPic[MAX_VIEWS] = { NULL };
128
     static int written = 0, read = 0;
129
     bool dontRead = false;
130
     bool dropflag = false;
131
 
132
-    if (m_exportedPic)
133
+    if (*m_exportedPic)
134
     {
135
         if (!m_param->bUseAnalysisFile && m_param->analysisSave)
136
-            x265_free_analysis_data(m_param, &m_exportedPic->m_analysisData);
137
-
138
-        ATOMIC_DEC(&m_exportedPic->m_countRefEncoders);
139
+            x265_free_analysis_data(m_param, &m_exportedPic[0]->m_analysisData);
140
 
141
-        m_exportedPic = NULL;
142
+        for (int i = 0; i < m_param->numLayers; i++)
143
+        {
144
+            ATOMIC_DEC(&m_exportedPic[i]->m_countRefEncoders);
145
+            m_exportedPic[i] = NULL;
146
+        }
147
         m_dpb->recycleUnreferenced();
148
 
149
         if (m_param->bEnableTemporalFilter)
150
@@ -1566,143 +1581,194 @@
151
 
152
             if (read < written)
153
             {
154
-                inputPic = m_dupBuffer[0]->dupPic;
155
+                inputPic[0] = m_dupBuffer[0]->dupPic;
156
                 read++;
157
             }
158
         }
159
         else
160
-            inputPic = pic_in;
161
+        {
162
+            for (int view = 0; view < m_param->numViews; view++)
163
+                inputPic[view] = pic_in + view;
164
+        }
165
 
166
-        Frame *inFrame;
167
-        x265_param *p = (m_reconfigure || m_reconfigureRc) ? m_latestParam : m_param;
168
-        if (m_dpb->m_freeList.empty())
169
-        {
170
-            inFrame = new Frame;
171
-            inFrame->m_encodeStartTime = x265_mdate();
172
-            if (inFrame->create(p, inputPic->quantOffsets))
173
-            {
174
-                /* the first PicYuv created is asked to generate the CU and block unit offset
175
-                 * arrays which are then shared with all subsequent PicYuv (orig and recon) 
176
-                 * allocated by this top level encoder */
177
-                if (m_sps.cuOffsetY)
178
-                {
179
-                    inFrame->m_fencPic->m_cuOffsetY = m_sps.cuOffsetY;
180
-                    inFrame->m_fencPic->m_buOffsetY = m_sps.buOffsetY;
181
-                    if (m_param->internalCsp != X265_CSP_I400)
182
-                    {
183
-                        inFrame->m_fencPic->m_cuOffsetC = m_sps.cuOffsetC;
184
-                        inFrame->m_fencPic->m_buOffsetC = m_sps.buOffsetC;
185
-                    }
186
-                }
187
-                else
188
+        x265_param* p = (m_reconfigure || m_reconfigureRc) ? m_latestParam : m_param;
189
+        Frame* inFrame[MAX_LAYERS];
190
+        for (int layer = 0; layer < m_param->numLayers; layer++)
191
+        {
192
+            if (m_dpb->m_freeList.empty())
193
+            {
194
+                inFrame[layer] = new Frame;
195
+                inFrame[layer]->m_encodeStartTime = x265_mdate();
196
+#if ENABLE_MULTIVIEW
197
+                inFrame[layer]->m_viewId = m_param->numViews > 1 ? layer : 0;
198
+#endif
199
+#if ENABLE_ALPHA
200
+                inFrame[layer]->m_sLayerId = m_param->numScalableLayers > 1 ? layer : 0;
201
x265_3.6.tar.gz/source/encoder/encoder.h -> x265_4.0.tar.gz/source/encoder/encoder.h Changed
61
 
1
@@ -202,7 +202,7 @@
2
     ThreadPool*        m_threadPool;
3
     FrameEncoder*      m_frameEncoder[X265_MAX_FRAME_THREADS];
4
     DPB*               m_dpb;
5
-    Frame*             m_exportedPic;
6
+    Frame*             m_exportedPic[MAX_LAYERS];
7
     FILE*              m_analysisFileIn;
8
     FILE*              m_analysisFileOut;
9
     FILE*              m_naluFile;
10
@@ -217,10 +217,10 @@
11
 
12
     bool               m_externalFlush;
13
     /* Collect statistics globally */
14
-    EncStats           m_analyzeAll;
15
-    EncStats           m_analyzeI;
16
-    EncStats           m_analyzeP;
17
-    EncStats           m_analyzeB;
18
+    EncStats           m_analyzeAll[MAX_LAYERS];
19
+    EncStats           m_analyzeI[MAX_LAYERS];
20
+    EncStats           m_analyzeP[MAX_LAYERS];
21
+    EncStats           m_analyzeB[MAX_LAYERS];
22
     VPS                m_vps;
23
     SPS                m_sps;
24
     PPS                m_pps;
25
@@ -300,7 +300,7 @@
26
     void stopJobs();
27
     void destroy();
28
 
29
-    int encode(const x265_picture* pic, x265_picture *pic_out);
30
+    int encode(const x265_picture* pic, x265_picture **pic_out);
31
 
32
     int reconfigureParam(x265_param* encParam, x265_param* param);
33
 
34
@@ -308,7 +308,7 @@
35
 
36
     void copyCtuInfo(x265_ctu_info_t** frameCtuInfo, int poc);
37
 
38
-    int copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut);
39
+    int copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut, int sLayer);
40
 
41
     int getRefFrameList(PicYuv** l0, PicYuv** l1, int sliceType, int poc, int* pocL0, int* pocL1);
42
 
43
@@ -320,7 +320,7 @@
44
 
45
     void getEndNalUnits(NALList& list, Bitstream& bs);
46
 
47
-    void fetchStats(x265_stats* stats, size_t statsSizeBytes);
48
+    void fetchStats(x265_stats* stats, size_t statsSizeBytes, int layer = 0);
49
 
50
     void printSummary();
51
 
52
@@ -352,7 +352,7 @@
53
 
54
     void copyDistortionData(x265_analysis_data* analysis, FrameData &curEncData);
55
 
56
-    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc);
57
+    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc, int layer);
58
 
59
     int validateAnalysisData(x265_analysis_validate* param, int readWriteFlag);
60
 
61
x265_3.6.tar.gz/source/encoder/entropy.cpp -> x265_4.0.tar.gz/source/encoder/entropy.cpp Changed
201
 
1
@@ -230,11 +230,12 @@
2
     X265_CHECK(sizeof(m_contextState) >= sizeof(m_contextState[0]) * MAX_OFF_CTX_MOD, "context state table is too small\n");
3
 }
4
 
5
-void Entropy::codeVPS(const VPS& vps)
6
+void Entropy::codeVPS(const VPS& vps, const SPS& sps)
7
 {
8
+    int maxLayers = (vps.m_numLayers > 1 || vps.m_numViews > 1) + 1;
9
     WRITE_CODE(0,       4, "vps_video_parameter_set_id");
10
     WRITE_CODE(3,       2, "vps_reserved_three_2bits");
11
-    WRITE_CODE(0,       6, "vps_reserved_zero_6bits");
12
+    WRITE_CODE(maxLayers - 1, 6, "vps_reserved_zero_6bits");
13
     WRITE_CODE(vps.maxTempSubLayers - 1, 3, "vps_max_sub_layers_minus1");
14
     WRITE_FLAG(vps.maxTempSubLayers == 1,   "vps_temporal_id_nesting_flag");
15
     WRITE_CODE(0xffff, 16, "vps_reserved_ffff_16bits");
16
@@ -250,50 +251,320 @@
17
         WRITE_UVLC(vps.maxLatencyIncrease[i] + 1, "vps_max_latency_increase_plus1[i]");
18
     }
19
 
20
+#if ENABLE_ALPHA || ENABLE_MULTIVIEW
21
+    if (vps.m_numLayers > 1 || vps.m_numViews > 1)
22
+    {
23
+        WRITE_CODE(maxLayers - 1, 6, "vps_max_nuh_reserved_zero_layer_id");
24
+        WRITE_UVLC(vps.m_vpsNumLayerSetsMinus1, "vps_num_layer_sets_minus1");
25
+        for (int i = 1; i <= vps.m_vpsNumLayerSetsMinus1; i++)
26
+        {
27
+#if ENABLE_MULTIVIEW
28
+            if (vps.m_numViews > 1)
29
+            {
30
+                for (int j = 0; j < vps.m_numViews; j++)
31
+                {
32
+                    WRITE_FLAG(1, "layer_id_included_flag[opsIdx][i]");
33
+                }
34
+            }
35
+#endif
36
+#if ENABLE_ALPHA
37
+            if (vps.m_numLayers > 1)
38
+            {
39
+                for (int j = 0; j < vps.m_numLayers; j++)
40
+                {
41
+                    WRITE_FLAG(1, "layer_id_included_flag[opsIdx][i]");
42
+                }
43
+            }
44
+#endif
45
+        }
46
+    }
47
+    else
48
+    {
49
+        WRITE_CODE(0, 6, "vps_max_nuh_reserved_zero_layer_id");
50
+        WRITE_UVLC(0, "vps_max_op_sets_minus1");
51
+    }
52
+#else
53
     WRITE_CODE(0, 6, "vps_max_nuh_reserved_zero_layer_id");
54
-    WRITE_UVLC(0,    "vps_max_op_sets_minus1");
55
+    WRITE_UVLC(0, "vps_max_op_sets_minus1");
56
+#endif
57
+
58
     WRITE_FLAG(0,    "vps_timing_info_present_flag"); /* we signal timing info in SPS-VUI */
59
-    WRITE_FLAG(0,    "vps_extension_flag");
60
+
61
+#if ENABLE_ALPHA || ENABLE_MULTIVIEW
62
+    if (vps.m_numLayers > 1 || vps.m_numViews > 1)
63
+    {
64
+        WRITE_FLAG(vps.vps_extension_flag, "vps_extension_flag");
65
+
66
+        if (vps.vps_extension_flag)
67
+        {
68
+            while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
69
+            {
70
+                WRITE_FLAG(1, "vps_extension_alignment_bit_equal_to_one");
71
+            }
72
+
73
+            WRITE_CODE(vps.ptl.levelIdc, 8, "general_level_idc");
74
+            if (vps.maxTempSubLayers > 1)
75
+            {
76
+                for (int i = 0; i < vps.maxTempSubLayers - 1; i++)
77
+                {
78
+                    WRITE_FLAG(0, "sub_layer_profile_present_flag[i]");
79
+                    WRITE_FLAG(0, "sub_layer_level_present_flag[i]");
80
+                }
81
+                for (int i = vps.maxTempSubLayers - 1; i < 8; i++)
82
+                    WRITE_CODE(0, 2, "reserved_zero_2bits");
83
+            }
84
+
85
+            WRITE_FLAG(vps.splitting_flag, "splitting flag");
86
+            for (int i = 0; i < MAX_VPS_NUM_SCALABILITY_TYPES; i++)
87
+            {
88
+                WRITE_FLAG(vps.m_scalabilityMask[i], "scalability_mask[i]");
89
+            }
90
+            for (int i = 0; i < vps.scalabilityTypes - vps.splitting_flag; i++)
91
+            {
92
+                WRITE_CODE(vps.m_dimensionIdLen[i] - 1, 3, "dimension_id_len_minus1[i]");
93
+            }
94
+            WRITE_FLAG(vps.m_nuhLayerIdPresentFlag, "vps_nuh_layer_id_present_flag");
95
+            for (int i = 1; i < maxLayers; i++)
96
+            {
97
+                if (vps.m_nuhLayerIdPresentFlag)
98
+                    WRITE_CODE(vps.m_layerIdInNuh[i], 6, "layer_id_in_nuh[i]");
99
+
100
+                if (!vps.splitting_flag)
101
+                {
102
+                    for (int j = 0; j < vps.scalabilityTypes; j++)
103
+                    {
104
+                        uint8_t bits = vps.m_dimensionIdLen[j];
105
+                        WRITE_CODE(vps.m_dimensionId[i][j], bits, "dimension_id[i][j]");
106
+                    }
107
+                }
108
+            }
109
+            WRITE_CODE(vps.m_viewIdLen, 4, "view_id_len");
110
+
111
+#if ENABLE_ALPHA
112
+            if (vps.m_numLayers > 1)
113
+            {
114
+                WRITE_FLAG(0, "direct_dependency_flag[1][0]");
115
+                WRITE_UVLC(0, "num_add_layer_sets");
116
+                WRITE_FLAG(0, "vps_sub_layers_max_minus1_present_flag");
117
+                WRITE_FLAG(0, "max_tid_ref_present_flag");
118
+                WRITE_FLAG(0, "default_ref_layers_active_flag");
119
+                WRITE_UVLC(2, "vps_num_profile_tier_level_minus1");
120
+                WRITE_FLAG(1, "vps_profile_present_flag");
121
+                codeProfileTier(vps.ptl, vps.maxTempSubLayers, 1);
122
+
123
+                WRITE_UVLC(0, "num_add_olss");
124
+                WRITE_CODE(0, 2, "default_output_layer_idc");
125
+                WRITE_CODE(1, 2, "profile_tier_level_idx[ i ][ j ]");
126
+                WRITE_CODE(2, 2, "profile_tier_level_idx[ i ][ j ]");
127
+
128
+                WRITE_UVLC(0, "vps_num_rep_formats_minus1");
129
+
130
+                WRITE_CODE(sps.picWidthInLumaSamples, 16, "pic_width_vps_in_luma_samples");
131
+                WRITE_CODE(sps.picHeightInLumaSamples, 16, "pic_height_vps_in_luma_samples");
132
+                WRITE_FLAG(1, "chroma_and_bit_depth_vps_present_flag");
133
+
134
+                WRITE_CODE(sps.chromaFormatIdc, 2, "chroma_format_vps_idc");
135
+
136
+                if (sps.chromaFormatIdc == X265_CSP_I444)
137
+                    WRITE_FLAG(0, "separate_colour_plane_vps_flag");
138
+
139
+                WRITE_CODE(X265_DEPTH - 8, 4, "bit_depth_vps_luma_minus8");
140
+                WRITE_CODE(X265_DEPTH - 8, 4, "bit_depth_vps_chroma_minus8");
141
+
142
+                const Window& conf = sps.conformanceWindow;
143
+                WRITE_FLAG(conf.bEnabled, "conformance_window_vps_flag");
144
+                if (conf.bEnabled)
145
+                {
146
+                    int hShift = CHROMA_H_SHIFT(sps.chromaFormatIdc), vShift = CHROMA_V_SHIFT(sps.chromaFormatIdc);
147
+                    WRITE_UVLC(conf.leftOffset >> hShift, "conf_win_vps_left_offset");
148
+                    WRITE_UVLC(conf.rightOffset >> hShift, "conf_win_vps_right_offset");
149
+                    WRITE_UVLC(conf.topOffset >> vShift, "conf_win_vps_top_offset");
150
+                    WRITE_UVLC(conf.bottomOffset >> vShift, "conf_win_vps_bottom_offset");
151
+                }
152
+
153
+                WRITE_FLAG(1, "max_one_active_ref_layer_flag");
154
+                WRITE_FLAG(0, "vps_poc_lsb_aligned_flag");
155
+                WRITE_FLAG(1, "poc_lsb_not_present_flag");
156
+
157
+                for (int i = 1; i < vps.m_vpsNumLayerSetsMinus1 + 1; i++)
158
+                {
159
+                    WRITE_FLAG(vps.maxTempSubLayers > 1, "sub_layer_flag_info_present_flag");
160
+                    for (int j = 0; j < vps.maxTempSubLayers ; j++)
161
+                    {
162
+                        if(j > 0)
163
+                        WRITE_FLAG(vps.maxTempSubLayers > 1, "sub_layer_dpb_info_present_flag");
164
+
165
+                        for(int k = 0; k < vps.m_numLayersInIdList[i]; k++)
167
+                            WRITE_UVLC(vps.maxDecPicBuffering[j] - 1, "vps_max_dec_pic_buffering_minus1[i]");
168
+
169
+                        WRITE_UVLC(vps.numReorderPics[0], "vps_num_reorder_pics[i]");
170
+                        WRITE_UVLC(vps.maxLatencyIncrease[0] + 1, "vps_max_latency_increase_plus1[i]");
170
+                    }
171
+                }
172
+
173
+                WRITE_UVLC(0, "direct_dep_type_len_minus2");
174
+
175
+                WRITE_FLAG(0, "default_direct_dependency_flag");
176
+                WRITE_UVLC(0, "vps_non_vui_extension_length");
177
+                WRITE_FLAG(0, "vps_vui_present_flag");
178
+                WRITE_FLAG(0, "vps_extension2_flag");
179
+        }
180
+#endif
181
+
182
+#if ENABLE_MULTIVIEW
183
+            if (vps.m_numViews > 1)
184
+            {
185
+                for (uint8_t i = 0; i < vps.m_numViews; i++)
186
+                    WRITE_CODE(i, vps.m_viewIdLen, "view_id_val[i]");
187
+
188
+                for (int i = 1; i < vps.m_numViews; i++)
189
+                {
190
+                    for (int j = 0; j < i; j++)
191
+                    {
192
+                        if (j == 0)
193
+                            WRITE_FLAG(1, "direct_dependency_flag[1][0]");
194
+                        else
195
+                            WRITE_FLAG(0, "direct_dependency_flag[1][0]");
196
+                    }
197
+                }
198
+                WRITE_FLAG(0, "vps_sub_layers_max_minus1_present_flag");
199
+                WRITE_FLAG(0, "max_tid_ref_present_flag");
200
+                WRITE_FLAG(1, "default_ref_layers_active_flag");
201
x265_3.6.tar.gz/source/encoder/entropy.h -> x265_4.0.tar.gz/source/encoder/entropy.h Changed
30
 
1
@@ -141,14 +141,14 @@
2
     void loadIntraDirModeLuma(const Entropy& src);
3
     void copyState(const Entropy& other);
4
 
5
-    void codeVPS(const VPS& vps);
6
-    void codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl);
7
-    void codePPS( const PPS& pps, bool filerAcross, int iPPSInitQpMinus26 );
8
-    void codeVUI(const VUI& vui, int maxSubTLayers, bool bEmitVUITimingInfo, bool bEmitVUIHRDInfo);
9
+    void codeVPS(const VPS& vps, const SPS& sps);
10
+    void codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl, int layer = 0);
11
+    void codePPS( const PPS& pps, bool filerAcross, int iPPSInitQpMinus26, int layer = 0);
12
+    void codeVUI(const VUI& vui, int maxSubTLayers, bool bEmitVUITimingInfo, bool bEmitVUIHRDInfo, int layer = 0);
13
     void codeAUD(const Slice& slice);
14
     void codeHrdParameters(const HRDInfo& hrd, int maxSubTLayers);
15
 
16
-    void codeSliceHeader(const Slice& slice, FrameData& encData, uint32_t slice_addr, uint32_t slice_addr_bits, int sliceQp);
17
+    void codeSliceHeader(const Slice& slice, FrameData& encData, uint32_t slice_addr, uint32_t slice_addr_bits, int sliceQp, int layer = 0);
18
     void codeSliceHeaderWPPEntryPoints(const uint32_t *substreamSizes, uint32_t numSubStreams, uint32_t maxOffset);
19
     void codeShortTermRefPicSet(const RPS& rps, int idx);
20
     void finishSlice()                 { encodeBinTrm(1); finish(); dynamic_cast<Bitstream*>(m_bitIf)->writeByteAlignment(); }
21
@@ -234,7 +234,7 @@
22
     void writeEpExGolomb(uint32_t symbol, uint32_t count);
23
     void writeCoefRemainExGolomb(uint32_t symbol, const uint32_t absGoRice);
24
 
25
-    void codeProfileTier(const ProfileTierLevel& ptl, int maxTempSubLayers);
26
+    void codeProfileTier(const ProfileTierLevel& ptl, int maxTempSubLayers, int layer = 0);
27
     void codeScalingList(const ScalingList&);
28
     void codeScalingList(const ScalingList& scalingList, uint32_t sizeId, uint32_t listId);
29
 
30
x265_3.6.tar.gz/source/encoder/frameencoder.cpp -> x265_4.0.tar.gz/source/encoder/frameencoder.cpp Changed
201
 
1
@@ -41,11 +41,9 @@
2
 
3
 FrameEncoder::FrameEncoder()
4
 {
5
-    m_prevOutputTime = x265_mdate();
6
     m_reconfigure = false;
7
     m_isFrameEncoder = true;
8
     m_threadActive = true;
9
-    m_slicetypeWaitTime = 0;
10
     m_activeWorkerCount = 0;
11
     m_completionCount = 0;
12
     m_outStreams = NULL;
13
@@ -56,11 +54,16 @@
14
     m_rows = NULL;
15
     m_top = NULL;
16
     m_param = NULL;
17
-    m_frame = NULL;
18
     m_cuGeoms = NULL;
19
     m_ctuGeomMap = NULL;
20
     m_localTldIdx = 0;
21
     memset(&m_rce, 0, sizeof(RateControlEntry));
22
+    for (int layer = 0; layer < MAX_LAYERS; layer++)
23
+    {
24
+        m_prevOutputTime[layer] = x265_mdate();
25
+        m_slicetypeWaitTime[layer] = 0;
26
+        m_frame[layer] = NULL;
27
+    }
28
 }
29
 
30
 void FrameEncoder::destroy()
31
@@ -94,6 +97,7 @@
32
     X265_FREE(m_ctuGeomMap);
33
     X265_FREE(m_substreamSizes);
34
     X265_FREE(m_nr);
35
+    X265_FREE(m_retFrameBuffer);
36
 
37
     m_frameFilter.destroy();
38
 
39
@@ -216,6 +220,9 @@
40
             ok &= !!m_frameEncTF->createRefPicInfo(&m_mcstfRefList[i], m_param);
41
     }
42
 
43
+    m_retFrameBuffer = X265_MALLOC(Frame*, m_param->numLayers);
44
+    for (int layer = 0; layer < m_param->numLayers; layer++)
45
+        m_retFrameBuffer[layer] = NULL;
46
     return ok;
47
 }
48
 
49
@@ -282,14 +289,17 @@
50
     return true;
51
 }
52
 
53
-bool FrameEncoder::startCompressFrame(Frame* curFrame)
54
+bool FrameEncoder::startCompressFrame(Frame* curFrame[MAX_LAYERS])
55
 {
56
-    m_slicetypeWaitTime = x265_mdate() - m_prevOutputTime;
57
-    m_frame = curFrame;
58
-    m_sliceType = curFrame->m_lowres.sliceType;
59
-    curFrame->m_encData->m_frameEncoderID = m_jpId;
60
-    curFrame->m_encData->m_jobProvider = this;
61
-    curFrame->m_encData->m_slice->m_mref = m_mref;
62
+    for (int layer = 0; layer < m_param->numLayers; layer++)
63
+    {
64
+        m_slicetypeWaitTimelayer = x265_mdate() - m_prevOutputTimelayer;
65
+        m_framelayer = curFramelayer;
66
+        curFramelayer->m_encData->m_frameEncoderID = m_jpId;
67
+        curFramelayer->m_encData->m_jobProvider = this;
68
+        curFramelayer->m_encData->m_slice->m_mref = m_mref;
69
+    }
70
+    m_sliceType = curFrame0->m_lowres.sliceType;
71
 
72
     if (!m_cuGeoms)
73
     {
74
@@ -355,15 +365,17 @@
75
     {
76
         if (m_param->bCTUInfo)
77
         {
78
-            while (!m_frame->m_ctuInfo)
79
-                m_frame->m_copied.wait();
80
+            while (!m_frame0->m_ctuInfo)
81
+                m_frame0->m_copied.wait();
82
         }
83
-        if ((m_param->bAnalysisType == AVC_INFO) && !m_param->analysisSave && !m_param->analysisLoad && !(IS_X265_TYPE_I(m_frame->m_lowres.sliceType)))
84
+        if ((m_param->bAnalysisType == AVC_INFO) && !m_param->analysisSave && !m_param->analysisLoad && !(IS_X265_TYPE_I(m_frame0->m_lowres.sliceType)))
85
         {
86
-            while (((m_frame->m_analysisData.interData == NULL && m_frame->m_analysisData.intraData == NULL) || (uint32_t)m_frame->m_poc != m_frame->m_analysisData.poc))
87
-                m_frame->m_copyMVType.wait();
88
+            while (((m_frame0->m_analysisData.interData == NULL && m_frame0->m_analysisData.intraData == NULL) || (uint32_t)m_frame0->m_poc != m_frame0->m_analysisData.poc))
89
+                m_frame0->m_copyMVType.wait();
90
         }
91
-        compressFrame();
92
+
93
+        for (int layer = 0; layer < m_param->numLayers; layer++)
94
+            compressFrame(layer);
95
         m_done.trigger(); /* FrameEncoder::getEncodedPicture() blocks for this event */
96
         m_enable.wait();
97
     }
98
@@ -371,7 +383,7 @@
99
 
100
 void FrameEncoder::WeightAnalysis::processTasks(int /* workerThreadId */)
101
 {
102
-    Frame* frame = master.m_frame;
103
+    Frame* frame = master.m_framemaster.m_sLayerId;
104
     weightAnalyse(*frame->m_encData->m_slice, *frame, *master.m_param);
105
 }
106
 
107
@@ -411,13 +423,13 @@
108
         memcpy(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize);
109
     }
110
 
111
-    bool isIDR = m_frame->m_lowres.sliceType == X265_TYPE_IDR;
112
+    bool isIDR = m_frame0->m_lowres.sliceType == X265_TYPE_IDR;
113
     return (payloadChange || isIDR);
114
 }
115
 
116
-void FrameEncoder::writeTrailingSEIMessages()
117
+void FrameEncoder::writeTrailingSEIMessages(int layer)
118
 {
119
-    Slice* slice = m_frame->m_encData->m_slice;
120
+    Slice* slice = m_framelayer->m_encData->m_slice;
121
     int planes = (m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
122
     int32_t payloadSize = 0;
123
 
124
@@ -444,21 +456,21 @@
125
     }
126
 
127
     m_seiReconPictureDigest.setSize(payloadSize);
128
-    m_seiReconPictureDigest.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_SUFFIX_SEI, m_nalList, false);
129
+    m_seiReconPictureDigest.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_SUFFIX_SEI, m_nalList, false, layer);
130
 }
131
 
132
-void FrameEncoder::compressFrame()
133
+void FrameEncoder::compressFrame(int layer)
134
 {
135
     ProfileScopeEvent(frameThread);
136
 
137
-    m_startCompressTime = x265_mdate();
138
+    m_startCompressTimelayer = x265_mdate();
139
     m_totalActiveWorkerCount = 0;
140
     m_activeWorkerCountSamples = 0;
141
-    m_totalWorkerElapsedTime = 0;
142
-    m_totalNoWorkerTime = 0;
143
+    m_totalWorkerElapsedTimelayer = 0;
144
+    m_totalNoWorkerTimelayer = 0;
145
     m_countRowBlocks = 0;
146
-    m_allRowsAvailableTime = 0;
147
-    m_stallStartTime = 0;
148
+    m_allRowsAvailableTimelayer = 0;
149
+    m_stallStartTimelayer = 0;
150
 
151
     m_completionCount = 0;
152
     memset((void*)m_bAllRowsStop, 0, sizeof(bool) * m_param->maxSlices);
153
@@ -466,18 +478,19 @@
154
     m_rowSliceTotalBits0 = 0;
155
     m_rowSliceTotalBits1 = 0;
156
 
157
-    m_SSDY = m_SSDU = m_SSDV = 0;
158
-    m_ssim = 0;
159
-    m_ssimCnt = 0;
160
-    memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats));
161
+    m_SSDYlayer = m_SSDUlayer = m_SSDVlayer = 0;
162
+    m_ssimlayer = 0;
163
+    m_ssimCntlayer = 0;
164
+    memset(&(m_framelayer->m_encData->m_frameStats), 0, sizeof(m_framelayer->m_encData->m_frameStats));
165
+    m_sLayerId = layer;
166
 
167
     if (m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
168
     {
169
-        int height = m_frame->m_fencPic->m_picHeight;
170
-        int width = m_frame->m_fencPic->m_picWidth;
171
-        intptr_t stride = m_frame->m_fencPic->m_stride;
172
+        int height = m_framelayer->m_fencPic->m_picHeight;
173
+        int width = m_framelayer->m_fencPic->m_picWidth;
174
+        intptr_t stride = m_framelayer->m_fencPic->m_stride;
175
 
176
-        if (!computeEdge(m_frame->m_edgeBitPic, m_frame->m_fencPic->m_picOrg0, NULL, stride, height, width, false, 1))
177
+        if (!computeEdge(m_framelayer->m_edgeBitPic, m_framelayer->m_fencPic->m_picOrg0, NULL, stride, height, width, false, 1))
178
         {
179
             x265_log(m_param, X265_LOG_ERROR, " Failed to compute edge !");
180
         }
181
@@ -486,15 +499,15 @@
182
     /* Emit access unit delimiter unless this is the first frame and the user is
183
      * not repeating headers (since AUD is supposed to be the first NAL in the access
184
      * unit) */
185
-    Slice* slice = m_frame->m_encData->m_slice;
186
+    Slice* slice = m_framelayer->m_encData->m_slice;
187
 
188
-    if (m_param->bEnableEndOfSequence && m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_frame->m_poc)
189
+    if (m_param->bEnableEndOfSequence && m_framelayer->m_lowres.sliceType == X265_TYPE_IDR && m_framelayer->m_poc)
190
     {
191
         m_bs.resetBits();
192
         m_nalList.serialize(NAL_UNIT_EOS, m_bs);
193
     }
194
 
195
-    if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
196
+    if (m_param->bEnableAccessUnitDelimiters && (m_framelayer->m_poc || m_param->bRepeatHeaders))
197
     {
198
         m_bs.resetBits();
199
         m_entropyCoder.setBitstream(&m_bs);
200
@@ -504,7 +517,7 @@
201
x265_3.6.tar.gz/source/encoder/frameencoder.h -> x265_4.0.tar.gz/source/encoder/frameencoder.h Changed
116
 
1
@@ -156,12 +156,12 @@
2
     void destroy();
3
 
4
     /* triggers encode of a new frame by the worker thread */
5
-    bool startCompressFrame(Frame* curFrame);
6
+    bool startCompressFrame(Frame* curFrameMAX_LAYERS);
7
 
8
     /* blocks until worker thread is done, returns access unit */
9
-    Frame *getEncodedPicture(NALList& list);
10
+    Frame **getEncodedPicture(NALList& list);
11
 
12
-    void initDecodedPictureHashSEI(int row, int cuAddr, int height);
13
+    void initDecodedPictureHashSEI(int row, int cuAddr, int height, int layer);
14
 
15
     Event                    m_enable;
16
     Event                    m_done;
17
@@ -190,34 +190,35 @@
18
     RateControlEntry         m_rce;
19
     SEIDecodedPictureHash    m_seiReconPictureDigest;
20
 
21
-    uint64_t                 m_SSDY;
22
-    uint64_t                 m_SSDU;
23
-    uint64_t                 m_SSDV;
24
-    double                   m_ssim;
25
-    uint64_t                 m_accessUnitBits;
26
-    uint32_t                 m_ssimCnt;
27
+    uint64_t                 m_SSDYMAX_LAYERS;
28
+    uint64_t                 m_SSDUMAX_LAYERS;
29
+    uint64_t                 m_SSDVMAX_LAYERS;
30
+    double                   m_ssimMAX_LAYERS;
31
+    uint64_t                 m_accessUnitBitsMAX_LAYERS;
32
+    uint32_t                 m_ssimCntMAX_LAYERS;
33
 
34
     volatile int             m_activeWorkerCount;        // count of workers currently encoding or filtering CTUs
35
     volatile int             m_totalActiveWorkerCount;   // sum of m_activeWorkerCount sampled at end of each CTU
36
     volatile int             m_activeWorkerCountSamples; // count of times m_activeWorkerCount was sampled (think vbv restarts)
37
     volatile int             m_countRowBlocks;           // count of workers forced to abandon a row because of top dependency
38
-    int64_t                  m_startCompressTime;        // timestamp when frame encoder is given a frame
39
-    int64_t                  m_row0WaitTime;             // timestamp when row 0 is allowed to start
40
-    int64_t                  m_allRowsAvailableTime;     // timestamp when all reference dependencies are resolved
41
-    int64_t                  m_endCompressTime;          // timestamp after all CTUs are compressed
42
-    int64_t                  m_endFrameTime;             // timestamp after RCEnd, NR updates, etc
43
-    int64_t                  m_stallStartTime;           // timestamp when worker count becomes 0
44
-    int64_t                  m_prevOutputTime;           // timestamp when prev frame was retrieved by API thread
45
-    int64_t                  m_slicetypeWaitTime;        // total elapsed time waiting for decided frame
46
-    int64_t                  m_totalWorkerElapsedTime;   // total elapsed time spent by worker threads processing CTUs
47
-    int64_t                  m_totalNoWorkerTime;        // total elapsed time without any active worker threads
48
+    int64_t                  m_startCompressTimeMAX_LAYERS;        // timestamp when frame encoder is given a frame
49
+    int64_t                  m_row0WaitTimeMAX_LAYERS;             // timestamp when row 0 is allowed to start
50
+    int64_t                  m_allRowsAvailableTimeMAX_LAYERS;     // timestamp when all reference dependencies are resolved
51
+    int64_t                  m_endCompressTimeMAX_LAYERS;          // timestamp after all CTUs are compressed
52
+    int64_t                  m_endFrameTimeMAX_LAYERS;             // timestamp after RCEnd, NR updates, etc
53
+    int64_t                  m_stallStartTimeMAX_LAYERS;           // timestamp when worker count becomes 0
54
+    int64_t                  m_prevOutputTimeMAX_LAYERS;           // timestamp when prev frame was retrieved by API thread
55
+    int64_t                  m_slicetypeWaitTimeMAX_LAYERS;        // total elapsed time waiting for decided frame
56
+    int64_t                  m_totalWorkerElapsedTimeMAX_LAYERS;   // total elapsed time spent by worker threads processing CTUs
57
+    int64_t                  m_totalNoWorkerTimeMAX_LAYERS;        // total elapsed time without any active worker threads
58
 #if DETAILED_CU_STATS
59
     CUStats                  m_cuStats;
60
 #endif
61
 
62
     Encoder*                 m_top;
63
     x265_param*              m_param;
64
-    Frame*                   m_frame;
65
+    Frame*                   m_frameMAX_LAYERS;
66
+    Frame**                  m_retFrameBuffer;
67
     NoiseReduction*          m_nr;
68
     ThreadLocalData*         m_tld; /* for --no-wpp */
69
     Bitstream*               m_outStreams;
70
@@ -238,6 +239,8 @@
71
     TemporalFilter*          m_frameEncTF;
72
     TemporalFilterRefPicInfo m_mcstfRefListMAX_MCSTF_TEMPORAL_WINDOW_LENGTH;
73
 
74
+    int                      m_sLayerId;
75
+
76
     class WeightAnalysis : public BondedTaskGroup
77
     {
78
     public:
79
@@ -258,20 +261,20 @@
80
     bool initializeGeoms();
81
 
82
     /* analyze / compress frame, can be run in parallel within reference constraints */
83
-    void compressFrame();
84
+    void compressFrame(int layer);
85
 
86
     /* called by compressFrame to generate final per-row bitstreams */
87
-    void encodeSlice(uint32_t sliceAddr);
88
+    void encodeSlice(uint32_t sliceAddr, int layer);
89
 
90
     void threadMain();
91
     int  collectCTUStatistics(const CUData& ctu, FrameStats* frameLog);
92
     void noiseReductionUpdate();
93
-    void writeTrailingSEIMessages();
94
+    void writeTrailingSEIMessages(int layer);
95
     bool writeToneMapInfo(x265_sei_payload *payload);
96
 
97
     /* Called by WaveFront::findJob() */
98
-    virtual void processRow(int row, int threadId);
99
-    virtual void processRowEncoder(int row, ThreadLocalData& tld);
100
+    virtual void processRow(int row, int threadId, int layer);
101
+    virtual void processRowEncoder(int row, ThreadLocalData& tld, int layer);
102
 
103
     void enqueueRowEncoder(int row) { WaveFront::enqueueRow(row * 2 + 0); }
104
     void enqueueRowFilter(int row)  { WaveFront::enqueueRow(row * 2 + 1); }
105
@@ -280,8 +283,8 @@
106
 #if ENABLE_LIBVMAF
107
     void vmafFrameLevelScore();
108
 #endif
109
-    void collectDynDataFrame();
110
-    void computeAvgTrainingData();
111
+    void collectDynDataFrame(int layer);
112
+    void computeAvgTrainingData(int layer);
113
     void collectDynDataRow(CUData& ctu, FrameStats* rowStats);    
114
     void readModel(FilmGrainCharacteristics* m_filmGrain, FILE* filmgrain);
115
 };
116
x265_3.6.tar.gz/source/encoder/framefilter.cpp -> x265_4.0.tar.gz/source/encoder/framefilter.cpp Changed
137
 
1
@@ -256,7 +256,7 @@
2
     const int size = cu->m_log2CUSizeabsPartIdx - 2;
3
     const uint32_t cuAddr = cu->m_cuAddr;
4
 
5
-    PicYuv* reconPic = frame.m_reconPic;
6
+    PicYuv* reconPic = frame.m_reconPic0;
7
     PicYuv* fencPic  = frame.m_fencPic;
8
 
9
     pixel* dst = reconPic->getLumaAddr(cuAddr, absPartIdx);
10
@@ -337,7 +337,7 @@
11
 
12
         uint32_t cuAddr = m_rowAddr + col;
13
         const CUData* ctu = m_encData->getPicCTU(cuAddr);
14
-        assert(m_frameFilter->m_frame->m_reconPic == m_encData->m_reconPic);
15
+        assert(m_frameFilter->m_frame->m_reconPic0 == m_encData->m_reconPic0);
16
         origCUSampleRestoration(ctu, cuGeomsctuGeomMapcuAddr, *m_frameFilter->m_frame);
17
     }
18
 }
19
@@ -352,7 +352,7 @@
20
     if ((col != 0) & (col != m_frameFilter->m_numCols - 1) & (m_row != 0) & (m_row != m_frameFilter->m_numRows - 1))
21
         return;
22
 
23
-    PicYuv *reconPic = m_frameFilter->m_frame->m_reconPic;
24
+    PicYuv *reconPic = m_frameFilter->m_frame->m_reconPic0;
25
     const uint32_t lineStartCUAddr = m_rowAddr + col;
26
     const int realH = getCUHeight();
27
     const int realW = m_frameFilter->getCUWidth(col);
28
@@ -441,7 +441,7 @@
29
     SAOParam* saoParam = m_encData->m_saoParam;
30
     const CUGeom* cuGeoms = m_frameFilter->m_frameEncoder->m_cuGeoms;
31
     const uint32_t* ctuGeomMap = m_frameFilter->m_frameEncoder->m_ctuGeomMap;
32
-    PicYuv* reconPic = m_encData->m_reconPic;
33
+    PicYuv* reconPic = m_encData->m_reconPic0;
34
     const int colStart = m_lastCol.get();
35
     const int numCols = m_frameFilter->m_numCols;
36
     // TODO: Waiting previous row finish or simple clip on it?
37
@@ -561,7 +561,7 @@
38
     }
39
 }
40
 
41
-void FrameFilter::processRow(int row)
42
+void FrameFilter::processRow(int row, int layer)
43
 {
44
     ProfileScopeEvent(filterCTURow);
45
 
46
@@ -572,7 +572,7 @@
47
 
48
     if (!m_param->bEnableLoopFilter && !m_useSao)
49
     {
50
-        processPostRow(row);
51
+        processPostRow(row, layer);
52
         return;
53
     }
54
     FrameData& encData = *m_frame->m_encData;
55
@@ -616,7 +616,7 @@
56
 
57
     // this row of CTUs has been encoded
58
     if (!ctu->m_bFirstRowInSlice)
59
-        processPostRow(row - 1);
60
+        processPostRow(row - 1, layer);
61
 
62
     // NOTE: slices parallelism will be execute out-of-order
63
     int numRowFinished = 0;
64
@@ -648,12 +648,12 @@
65
     }
66
 
67
     if (ctu->m_bLastRowInSlice)
68
-        processPostRow(row);
69
+        processPostRow(row, layer);
70
 }
71
 
72
-void FrameFilter::processPostRow(int row)
73
+void FrameFilter::processPostRow(int row, int layer)
74
 {
75
-    PicYuv *reconPic = m_frame->m_reconPic;
76
+    PicYuv *reconPic = m_frame->m_reconPic0;
77
     const uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
78
     const uint32_t lineStartCUAddr = row * numCols;
79
 
80
@@ -673,7 +673,7 @@
81
         uint32_t height = m_parallelFilterrow.getCUHeight();
82
 
83
         uint64_t ssdY = m_frameEncoder->m_top->computeSSD(fencPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height, m_param);
84
-        m_frameEncoder->m_SSDY += ssdY;
85
+        m_frameEncoder->m_SSDYlayer += ssdY;
86
 
87
         if (m_param->internalCsp != X265_CSP_I400)
88
         {
89
@@ -684,8 +684,8 @@
90
             uint64_t ssdU = m_frameEncoder->m_top->computeSSD(fencPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height, m_param);
91
             uint64_t ssdV = m_frameEncoder->m_top->computeSSD(fencPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height, m_param);
92
 
93
-            m_frameEncoder->m_SSDU += ssdU;
94
-            m_frameEncoder->m_SSDV += ssdV;
95
+            m_frameEncoder->m_SSDUlayer += ssdU;
96
+            m_frameEncoder->m_SSDVlayer += ssdV;
97
         }
98
     }
99
 
100
@@ -705,15 +705,15 @@
101
         /* SSIM is done for each row in blocks of 4x4 . The First blocks are offset by 2 pixels to the right
102
         * to avoid alignment of ssim blocks with DCT blocks. */
103
         minPixY += bStart ? 2 : -6;
104
-        m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, fenc + 2 + minPixY * stride2, stride2,
105
+        m_frameEncoder->m_ssimlayer += calculateSSIM(rec + 2 + minPixY * stride1, stride1, fenc + 2 + minPixY * stride2, stride2,
106
                                                 m_param->sourceWidth - 2, maxPixY - minPixY, m_ssimBuf, ssim_cnt);
107
-        m_frameEncoder->m_ssimCnt += ssim_cnt;
108
+        m_frameEncoder->m_ssimCntlayer += ssim_cnt;
109
     }
110
 
111
     if (m_param->maxSlices == 1)
112
     {
113
         uint32_t height = m_parallelFilterrow.getCUHeight();
114
-        m_frameEncoder->initDecodedPictureHashSEI(row, cuAddr, height);
115
+        m_frameEncoder->initDecodedPictureHashSEI(row, cuAddr, height, layer);
116
     } // end of (m_param->maxSlices == 1)
117
 
118
     if (ATOMIC_INC(&m_frameEncoder->m_completionCount) == 2 * (int)m_frameEncoder->m_numRows)
119
@@ -737,7 +737,7 @@
120
             }
121
         }
122
 
123
-        int stride = (int)m_frame->m_reconPic->m_stride;
124
+        int stride = (int)m_frame->m_reconPic0->m_stride;
125
         int padX = m_param->maxCUSize + 32;
126
         int padY = m_param->maxCUSize + 16;
127
         int numCuInHeight = m_frame->m_encData->m_slice->m_sps->numCuInHeight;
128
@@ -763,7 +763,7 @@
129
 
130
         for (int y = startRow; y < height; y++)
131
         {
132
-            pixel    *pix = m_frame->m_reconPic->m_picOrg0 + y * stride - padX;
133
+            pixel    *pix = m_frame->m_reconPic0->m_picOrg0 + y * stride - padX;
134
             uint32_t *sum32x32 = m_frame->m_encData->m_meIntegral0 + (y + 1) * stride - padX;
135
             uint32_t *sum32x24 = m_frame->m_encData->m_meIntegral1 + (y + 1) * stride - padX;
136
             uint32_t *sum32x8 = m_frame->m_encData->m_meIntegral2 + (y + 1) * stride - padX;
137
x265_3.6.tar.gz/source/encoder/framefilter.h -> x265_4.0.tar.gz/source/encoder/framefilter.h Changed
12
 
1
@@ -128,8 +128,8 @@
2
 
3
     void start(Frame *pic, Entropy& initState);
4
 
5
-    void processRow(int row);
6
-    void processPostRow(int row);
7
+    void processRow(int row, int layer);
8
+    void processPostRow(int row, int layer);
9
     void computeMEIntegral(int row);
10
 };
11
 }
12
x265_3.6.tar.gz/source/encoder/level.cpp -> x265_4.0.tar.gz/source/encoder/level.cpp Changed
201
 
1
@@ -60,6 +60,42 @@
2
     { MAX_UINT, MAX_UINT, MAX_UINT, MAX_UINT, MAX_UINT, MAX_UINT, 1, Level::LEVEL8_5, "8.5", 85 },
3
 };
4
 
5
+#if ENABLE_SCC_EXT
6
+enum SCCProfileName
7
+{
8
+    NONE = 0,
9
+    // The following are SCC profiles, which would map to the MAINSCC profile idc.
10
+    // The enumeration indicates the bit-depth constraint in the bottom 2 digits
11
+    //                           the chroma format in the next digit
12
+    //                           the intra constraint in the next digit
13
+    //                           If it is a SCC profile there is a '2' for the next digit.
14
+    //                           If it is a highthroughput , there is a '2' for the top digit else '1' for the top digit
15
+    SCC_MAIN = 121108,
16
+    SCC_MAIN_10 = 121110,
17
+    SCC_MAIN_444 = 121308,
18
+    SCC_MAIN_444_10 = 121310,
19
+};
20
+
21
+static const SCCProfileName validSCCProfileNames14/* bit depth constraint 8=0, 10=1, 12=2, 14=3*/4/*chroma format*/ =
22
+{
23
+   {
24
+        { NONE,         SCC_MAIN,      NONE,      SCC_MAIN_444                     }, // 8-bit  intra for 400, 420, 422 and 444
25
+        { NONE,         SCC_MAIN_10,   NONE,      SCC_MAIN_444_10                  }, // 10-bit intra for 400, 420, 422 and 444
26
+        { NONE,         NONE,          NONE,      NONE                             }, // 12-bit intra for 400, 420, 422 and 444
27
+        { NONE,         NONE,          NONE,      NONE                             }  // 16-bit intra for 400, 420, 422 and 444
28
+    },
29
+};
30
+#endif
31
+
32
+static inline int _confirm(x265_param* param, bool bflag, const char* message)
33
+{
34
+    if (!bflag)
35
+        return 0;
36
+
37
+    x265_log(param, X265_LOG_ERROR, "%s\n", message);
38
+    return 1;
39
+}
40
+
41
 /* determine minimum decoder level required to decode the described video */
42
 void determineLevel(const x265_param &param, VPS& vps)
43
 {
44
@@ -80,45 +116,74 @@
45
         if (param.internalBitDepth <= 8)
46
         {
47
             if (vps.ptl.onePictureOnlyConstraintFlag)
48
-                vps.ptl.profileIdc = Profile::MAINSTILLPICTURE;
49
+                vps.ptl.profileIdc0 = Profile::MAINSTILLPICTURE;
50
             else if (vps.ptl.intraConstraintFlag)
51
-                vps.ptl.profileIdc = Profile::MAINREXT; /* Main Intra */
52
+                vps.ptl.profileIdc0 = Profile::MAINREXT; /* Main Intra */
53
             else 
54
-                vps.ptl.profileIdc = Profile::MAIN;
55
+                vps.ptl.profileIdc0 = Profile::MAIN;
56
+
57
+#if ENABLE_ALPHA
58
+            if (param.numScalableLayers == 2)
59
+                vps.ptl.profileIdc1 = Profile::SCALABLEMAIN;
60
+#endif
61
         }
62
         else if (param.internalBitDepth <= 10)
63
         {
64
             /* note there is no 10bit still picture profile */
65
             if (vps.ptl.intraConstraintFlag)
66
-                vps.ptl.profileIdc = Profile::MAINREXT; /* Main10 Intra */
67
+                vps.ptl.profileIdc0 = Profile::MAINREXT; /* Main10 Intra */
68
             else
69
-                vps.ptl.profileIdc = Profile::MAIN10;
70
+                vps.ptl.profileIdc0 = Profile::MAIN10;
71
+
72
+#if ENABLE_ALPHA
73
+            if (param.numScalableLayers == 2)
74
+                vps.ptl.profileIdc1 = Profile::SCALABLEMAIN10;
75
+#endif
76
         }
77
     }
78
     else
79
-        vps.ptl.profileIdc = Profile::MAINREXT;
80
+        vps.ptl.profileIdc0 = Profile::MAINREXT;
81
+
82
+#if ENABLE_MULTIVIEW
83
+    if (param.numViews == 2)
84
+        vps.ptl.profileIdc1 = Profile::MULTIVIEWMAIN;
85
+#endif
86
+
87
+#if ENABLE_SCC_EXT
88
+    if (param.bEnableSCC)
89
+        vps.ptl.profileIdc0 = Profile::MAINSCC;
90
 
91
     /* determine which profiles are compatible with this stream */
92
+    if (vps.ptl.profileIdc0 == Profile::MAINSCC)
93
+    {
94
+        vps.ptl.onePictureOnlyConstraintFlag = false;
95
+        vps.ptl.intraConstraintFlag = param.keyframeMax <= 1 || vps.ptl.onePictureOnlyConstraintFlag;
96
+    }
97
+#endif
98
 
99
     memset(vps.ptl.profileCompatibilityFlag, 0, sizeof(vps.ptl.profileCompatibilityFlag));
100
-    vps.ptl.profileCompatibilityFlagvps.ptl.profileIdc = true;
101
-    if (vps.ptl.profileIdc == Profile::MAIN10 && param.internalBitDepth == 8)
102
+    vps.ptl.profileCompatibilityFlagvps.ptl.profileIdc0 = true;
103
+    if (vps.ptl.profileIdc0 == Profile::MAIN10 && param.internalBitDepth == 8)
104
         vps.ptl.profileCompatibilityFlagProfile::MAIN = true;
105
-    else if (vps.ptl.profileIdc == Profile::MAIN)
106
+    else if (vps.ptl.profileIdc0 == Profile::MAIN)
107
         vps.ptl.profileCompatibilityFlagProfile::MAIN10 = true;
108
-    else if (vps.ptl.profileIdc == Profile::MAINSTILLPICTURE)
109
+    else if (vps.ptl.profileIdc0 == Profile::MAINSTILLPICTURE)
110
     {
111
         vps.ptl.profileCompatibilityFlagProfile::MAIN = true;
112
         vps.ptl.profileCompatibilityFlagProfile::MAIN10 = true;
113
     }
114
-    else if (vps.ptl.profileIdc == Profile::MAINREXT)
115
+    else if (vps.ptl.profileIdc0 == Profile::MAINREXT)
116
         vps.ptl.profileCompatibilityFlagProfile::MAINREXT = true;
117
+#if ENABLE_SCC_EXT
118
+    else if (vps.ptl.profileIdc0 == Profile::MAINSCC)
119
+        vps.ptl.profileCompatibilityFlagProfile::MAINSCC = true;
120
+#endif
121
 
122
     uint32_t lumaSamples = param.sourceWidth * param.sourceHeight;
123
     uint32_t samplesPerSec = (uint32_t)(lumaSamples * ((double)param.fpsNum / param.fpsDenom));
124
     uint32_t bitrate = param.rc.vbvMaxBitrate ? param.rc.vbvMaxBitrate : param.rc.bitrate;
125
 
126
-    const uint32_t MaxDpbPicBuf = 6;
127
+    const uint32_t MaxDpbPicBuf = param.bEnableSCC ? 7 : 6;
128
     vps.ptl.levelIdc = Level::NONE;
129
     vps.ptl.tierFlag = Level::MAIN;
130
 
131
@@ -174,7 +239,7 @@
132
         if (levelsi.levelEnum >= Level::LEVEL5 && param.maxCUSize < 32)
133
         {
134
             x265_log(&param, X265_LOG_WARNING, "level %s detected, but CTU size 16 is non-compliant\n", levelsi.name);
135
-            vps.ptl.profileIdc = Profile::NONE;
136
+            vps.ptl.profileIdc0 = Profile::NONE;
137
             vps.ptl.levelIdc = Level::NONE;
138
             vps.ptl.tierFlag = Level::MAIN;
139
             x265_log(&param, X265_LOG_INFO, "NONE profile, Level-NONE (Main tier)\n");
140
@@ -186,7 +251,7 @@
141
         if (numPocTotalCurr > 10)
142
         {
143
             x265_log(&param, X265_LOG_WARNING, "level %s detected, but NumPocTotalCurr (total references) is non-compliant\n", levelsi.name);
144
-            vps.ptl.profileIdc = Profile::NONE;
145
+            vps.ptl.profileIdc0 = Profile::NONE;
146
             vps.ptl.levelIdc = Level::NONE;
147
             vps.ptl.tierFlag = Level::MAIN;
148
             x265_log(&param, X265_LOG_INFO, "NONE profile, Level-NONE (Main tier)\n");
149
@@ -217,14 +282,32 @@
150
         break;
151
     }
152
 
153
-    static const char *profiles = { "None", "Main", "Main 10", "Main Still Picture", "RExt" };
154
+#if ENABLE_SCC_EXT
155
+    x265_param m_param = param;
156
+#define CHECK(expr, msg) check_failed |= _confirm(&m_param, expr, msg)
157
+    int check_failed = 0; /* abort if there is a fatal configuration problem */
158
+
159
+    if (vps.ptl.profileIdc0 == Profile::MAINSCC)
160
+    {
161
+        CHECK(vps.ptl.lowerBitRateConstraintFlag == false && vps.ptl.intraConstraintFlag == false, "The lowerBitRateConstraint flag cannot be false when intraConstraintFlag is false");
162
+        CHECK(param.bEnableSCC && !(vps.ptl.profileIdc0 == Profile::MAINSCC), "UseIntraBlockCopy must not be enabled unless the SCC profile is being used.");
163
+        CHECK(vps.ptl.intraConstraintFlag, "intra constraint flag must be 0 for SCC profiles");
164
+        CHECK(vps.ptl.onePictureOnlyConstraintFlag, "one-picture-only constraint flag shall be 0 for SCC profiles");
165
+        const uint32_t bitDepthIdx = (vps.ptl.bitDepthConstraint == 8 ? 0 : (vps.ptl.bitDepthConstraint == 10 ? 1 : (vps.ptl.bitDepthConstraint == 12 ? 2 : (vps.ptl.bitDepthConstraint == 16 ? 3 : 4))));
166
+        const uint32_t chromaFormatIdx = uint32_t(vps.ptl.chromaFormatConstraint);
167
+        const bool bValidProfile = (bitDepthIdx > 2 || chromaFormatIdx > 3) ? false : (validSCCProfileNames0bitDepthIdxchromaFormatIdx != NONE);
168
+        CHECK(!bValidProfile, "Invalid intra constraint flag, bit depth constraint flag and chroma format constraint flag combination for a RExt profile");
169
+    }
170
+#endif
171
+
172
+    static const char* profiles = { "None", "Main", "Main 10", "Main Still Picture", "RExt", "", "", "", "", "Main Scc" };
173
     static const char *tiers    = { "Main", "High" };
174
 
175
     char profbuf64;
176
-    strcpy(profbuf, profilesvps.ptl.profileIdc);
177
+    strcpy(profbuf, profilesvps.ptl.profileIdc0);
178
 
179
     bool bStillPicture = false;
180
-    if (vps.ptl.profileIdc == Profile::MAINREXT)
181
+    if (vps.ptl.profileIdc0 == Profile::MAINREXT)
182
     {
183
         if (vps.ptl.bitDepthConstraint > 12 && vps.ptl.intraConstraintFlag)
184
         {
185
@@ -277,6 +360,27 @@
186
         if (vps.ptl.intraConstraintFlag && !bStillPicture)
187
             strcat(profbuf, " Intra");
188
     }
189
+
190
+#if ENABLE_SCC_EXT
191
+    if (vps.ptl.profileIdc0 == Profile::MAINSCC)
192
+    {
193
+        if (param.internalCsp == X265_CSP_I420)
194
+        {
195
+            if (vps.ptl.bitDepthConstraint <= 8)
196
+                strcpy(profbuf, "Main Scc");
197
+            else if (vps.ptl.bitDepthConstraint <= 10)
198
+                strcpy(profbuf, "Main 10 Scc");
199
+        }
200
+        else if (param.internalCsp == X265_CSP_I444)
201
x265_3.6.tar.gz/source/encoder/motion.cpp -> x265_4.0.tar.gz/source/encoder/motion.cpp Changed
23
 
1
@@ -770,6 +770,7 @@
2
                                    int              merange,
3
                                    MV &             outQMv,
4
                                    uint32_t         maxSlices,
5
+                                    bool            m_vertRestriction,
6
                                    pixel *          srcReferencePlane)
7
 {
8
     ALIGN_VAR_16(int, costs16);
9
@@ -794,6 +795,13 @@
10
 
11
     // measure SAD cost at clipped QPEL MVP
12
     MV pmv = qmvp.clipped(qmvmin, qmvmax);
13
+    if (m_vertRestriction)
14
+    {
15
+        if (pmv.y > mvmax.y << 2)
16
+        {
17
+            pmv.y = (mvmax.y << 2);
18
+        }
19
+    }
20
     MV bestpre = pmv;
21
     int bprecost;
22
 
23
x265_3.6.tar.gz/source/encoder/motion.h -> x265_4.0.tar.gz/source/encoder/motion.h Changed
10
 
1
@@ -95,7 +95,7 @@
2
     }
3
 
4
     void refineMV(ReferencePlanes* ref, const MV& mvmin, const MV& mvmax, const MV& qmvp, MV& outQMv);
5
-    int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv, uint32_t maxSlices, pixel *srcReferencePlane = 0);
6
+    int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv, uint32_t maxSlices, bool m_vertRestriction, pixel *srcReferencePlane = 0);
7
 
8
     int subpelCompare(ReferencePlanes* ref, const MV &qmv, pixelcmp_t);
9
 
10
x265_3.6.tar.gz/source/encoder/nal.cpp -> x265_4.0.tar.gz/source/encoder/nal.cpp Changed
19
 
1
@@ -57,7 +57,7 @@
2
     other.m_buffer = X265_MALLOC(uint8_t, m_allocSize);
3
 }
4
 
5
-void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID)
6
+void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs, int layerId, uint8_t temporalID)
7
 {
8
     static const char startCodePrefix = { 0, 0, 0, 1 };
9
 
10
@@ -114,7 +114,7 @@
11
      * nuh_reserved_zero_6bits  6-bits
12
      * nuh_temporal_id_plus1    3-bits */
13
     outbytes++ = (uint8_t)nalUnitType << 1;
14
-    outbytes++ = temporalID;
15
+    outbytes++ = (layerId << 3) | (temporalID);
16
 
17
     /* 7.4.1 ...
18
      * Within the NAL unit, the following three-byte sequences shall not occur at
19
x265_3.6.tar.gz/source/encoder/nal.h -> x265_4.0.tar.gz/source/encoder/nal.h Changed
22
 
1
@@ -35,7 +35,11 @@
2
 class NALList
3
 {
4
 public:
5
+#if ENABLE_MULTIVIEW || ENABLE_ALPHA
6
+    static const int MAX_NAL_UNITS = 32;
7
+#else
8
     static const int MAX_NAL_UNITS = 16;
9
+#endif
10
 
11
 public:
12
 
13
@@ -56,7 +60,7 @@
14
 
15
     void takeContents(NALList& other);
16
 
17
-    void serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID = 1);
18
+    void serialize(NalUnitType nalUnitType, const Bitstream& bs, int layerId = 0, uint8_t temporalID = 1);
19
 
20
     uint32_t serializeSubstreams(uint32_t* streamSizeBytes, uint32_t streamCount, const Bitstream* streams);
21
 };
22
x265_3.6.tar.gz/source/encoder/ratecontrol.cpp -> x265_4.0.tar.gz/source/encoder/ratecontrol.cpp Changed
21
 
1
@@ -1349,6 +1349,10 @@
2
     FrameData& curEncData = *curFrame->m_encData;
3
     m_curSlice = curEncData.m_slice;
4
     m_sliceType = m_curSlice->m_sliceType;
5
+#if ENABLE_SCC_EXT
6
+    if(m_param->bEnableSCC)
7
+        m_sliceType = m_curSlice->m_origSliceType;
8
+#endif
9
     rce->sliceType = m_sliceType;
10
     if (!m_2pass)
11
         rce->keptAsRef = IS_REFERENCED(curFrame);
12
@@ -1466,7 +1470,7 @@
13
 
14
         int mincr = enc->m_vps.ptl.minCrForLevel;
15
         /* Profiles above Main10 don't require maxAU size check, so just set the maximum to a large value. */
16
-        if (enc->m_vps.ptl.profileIdc > Profile::MAIN10 || enc->m_vps.ptl.levelIdc == Level::NONE)
17
+        if (enc->m_vps.ptl.profileIdc0 > Profile::MAIN10 || enc->m_vps.ptl.levelIdc == Level::NONE)
18
             rce->frameSizeMaximum = 1e9;
19
         else
20
         {
21
x265_3.6.tar.gz/source/encoder/sao.cpp -> x265_4.0.tar.gz/source/encoder/sao.cpp Changed
201
 
1
@@ -36,12 +36,6 @@
2
     return num >= 0 ? ((num * 2 + den) / (den * 2)) : -((-num * 2 + den) / (den * 2));
3
 }
4
 
5
-/* get the sign of input variable (TODO: this is a dup, make common) */
6
-inline int8_t signOf(int x)
7
-{
8
-    return (x >> 31) | ((int)((((uint32_t)-x)) >> 31));
9
-}
10
-
11
 inline int signOf2(const int a, const int b)
12
 {
13
     // NOTE: don't reorder below compare, both ICL, VC, GCC optimize strong depends on order!
14
@@ -273,7 +267,7 @@
15
 // CTU-based SAO process without slice granularity
16
 void SAO::applyPixelOffsets(int addr, int typeIdx, int plane)
17
 {
18
-    PicYuv* reconPic = m_frame->m_reconPic;
19
+    PicYuv* reconPic = m_frame->m_reconPic0;
20
     pixel* rec = reconPic->getPlaneAddr(plane, addr);
21
     intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
22
     uint32_t picWidth  = m_param->sourceWidth;
23
@@ -328,10 +322,10 @@
24
         {
25
             for (int y = 0; y < ctuHeight; y++, rec += stride)
26
             {
27
-                int signLeft = signOf(recstartX - tmpLy);
28
+                int signLeft = x265_signOf(recstartX - tmpLy);
29
                 for (int x = startX; x < endX; x++)
30
                 {
31
-                    int signRight = signOf(recx - recx + 1);
32
+                    int signRight = x265_signOf(recx - recx + 1);
33
                     int edgeType = signRight + signLeft + 2;
34
                     signLeft = -signRight;
35
 
36
@@ -343,8 +337,8 @@
37
         {
38
             for (int y = 0; y < ctuHeight; y += 2, rec += 2 * stride)
39
             {
40
-                signLeft10 = signOf(recstartX - tmpLy);
41
-                signLeft11 = signOf(recstride + startX - tmpLy + 1);
42
+                signLeft10 = x265_signOf(recstartX - tmpLy);
43
+                signLeft11 = x265_signOf(recstride + startX - tmpLy + 1);
44
 
45
                 if (!lpelx)
46
                 {
47
@@ -385,13 +379,13 @@
48
         if (ctuWidth & 15)
49
         {
50
             for (int x = 0; x < ctuWidth; x++)
51
-                upBuff1x = signOf(recx - tmpUx);
52
+                upBuff1x = x265_signOf(recx - tmpUx);
53
 
54
             for (int y = startY; y < endY; y++, rec += stride)
55
             {
56
                 for (int x = 0; x < ctuWidth; x++)
57
                 {
58
-                    int8_t signDown = signOf(recx - recx + stride);
59
+                    int8_t signDown = x265_signOf(recx - recx + stride);
60
                     int edgeType = signDown + upBuff1x + 2;
61
                     upBuff1x = -signDown;
62
 
63
@@ -445,17 +439,17 @@
64
         else
65
         {
66
             for (int x = startX; x < endX; x++)
67
-                upBuff1x = signOf(recx - tmpUx - 1);
68
+                upBuff1x = x265_signOf(recx - tmpUx - 1);
69
         }
70
 
71
         if (ctuWidth & 15)
72
         {
73
              for (int y = startY; y < endY; y++, rec += stride)
74
              {
75
-                 upBufftstartX = signOf(recstride + startX - tmpLy);
76
+                 upBufftstartX = x265_signOf(recstride + startX - tmpLy);
77
                  for (int x = startX; x < endX; x++)
78
                  {
79
-                     int8_t signDown = signOf(recx - recx + stride + 1);
80
+                     int8_t signDown = x265_signOf(recx - recx + stride + 1);
81
                      int edgeType = signDown + upBuff1x + 2;
82
                      upBufftx + 1 = -signDown;
83
                      recx = m_clipTablerecx + offsetEoedgeType;
84
@@ -468,7 +462,7 @@
85
         {
86
             for (int y = startY; y < endY; y++, rec += stride)
87
             {
88
-                int8_t iSignDown2 = signOf(recstride + startX - tmpLy);
89
+                int8_t iSignDown2 = x265_signOf(recstride + startX - tmpLy);
90
 
91
                 primitives.saoCuOrgE2endX > 16(rec + startX, upBufft + startX, upBuff1 + startX, offsetEo, endX - startX, stride);
92
 
93
@@ -493,25 +487,25 @@
94
         if (ctuWidth & 15)
95
         {
96
             for (int x = startX - 1; x < endX; x++)
97
-                upBuff1x = signOf(recx - tmpUx + 1);
98
+                upBuff1x = x265_signOf(recx - tmpUx + 1);
99
 
100
             for (int y = startY; y < endY; y++, rec += stride)
101
             {
102
                 int x = startX;
103
-                int8_t signDown = signOf(recx - tmpLy + 1);
104
+                int8_t signDown = x265_signOf(recx - tmpLy + 1);
105
                 int edgeType = signDown + upBuff1x + 2;
106
                 upBuff1x - 1 = -signDown;
107
                 recx = m_clipTablerecx + offsetEoedgeType;
108
 
109
                 for (x = startX + 1; x < endX; x++)
110
                 {
111
-                    signDown = signOf(recx - recx + stride - 1);
112
+                    signDown = x265_signOf(recx - recx + stride - 1);
113
                     edgeType = signDown + upBuff1x + 2;
114
                     upBuff1x - 1 = -signDown;
115
                     recx = m_clipTablerecx + offsetEoedgeType;
116
                 }
117
 
118
-                upBuff1endX - 1 = signOf(recendX - 1 + stride - recendX);
119
+                upBuff1endX - 1 = x265_signOf(recendX - 1 + stride - recendX);
120
             }
121
         }
122
         else
123
@@ -519,7 +513,7 @@
124
             int8_t firstSign, lastSign;
125
 
126
             if (lpelx)
127
-                firstSign = signOf(rec-1 - tmpU0);
128
+                firstSign = x265_signOf(rec-1 - tmpU0);
129
             if (rpelx == picWidth)
130
                 lastSign = upBuff1ctuWidth - 1;
131
 
132
@@ -533,14 +527,14 @@
133
             for (int y = startY; y < endY; y++, rec += stride)
134
             {
135
                 int x = startX;
136
-                int8_t signDown = signOf(recx - tmpLy + 1);
137
+                int8_t signDown = x265_signOf(recx - tmpLy + 1);
138
                 int edgeType = signDown + upBuff1x + 2;
139
                 upBuff1x - 1 = -signDown;
140
                 recx = m_clipTablerecx + offsetEoedgeType;
141
 
142
                 primitives.saoCuOrgE3endX > 16(rec, upBuff1, offsetEo, stride - 1, startX, endX);
143
 
144
-                upBuff1endX - 1 = signOf(recendX - 1 + stride - recendX);
145
+                upBuff1endX - 1 = x265_signOf(recendX - 1 + stride - recendX);
146
             }
147
         }
148
 
149
@@ -571,7 +565,7 @@
150
 /* Process SAO unit */
151
 void SAO::generateLumaOffsets(SaoCtuParam* ctuParam, int idxY, int idxX)
152
 {
153
-    PicYuv* reconPic = m_frame->m_reconPic;
154
+    PicYuv* reconPic = m_frame->m_reconPic0;
155
     intptr_t stride = reconPic->m_stride;
156
     int ctuWidth = m_param->maxCUSize;
157
     int ctuHeight = m_param->maxCUSize;
158
@@ -631,7 +625,7 @@
159
 /* Process SAO unit (Chroma only) */
160
 void SAO::generateChromaOffsets(SaoCtuParam* ctuParam3, int idxY, int idxX)
161
 {
162
-    PicYuv* reconPic = m_frame->m_reconPic;
163
+    PicYuv* reconPic = m_frame->m_reconPic0;
164
     intptr_t stride = reconPic->m_strideC;
165
     int ctuWidth  = m_param->maxCUSize;
166
     int ctuHeight = m_param->maxCUSize;
167
@@ -735,7 +729,7 @@
168
 void SAO::calcSaoStatsCTU(int addr, int plane)
169
 {
170
     Slice* slice = m_frame->m_encData->m_slice;
171
-    const PicYuv* reconPic = m_frame->m_reconPic;
172
+    const PicYuv* reconPic = m_frame->m_reconPic0;
173
     const CUData* cu = m_frame->m_encData->getPicCTU(addr);
174
     const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
175
     const pixel* rec0  = reconPic->getPlaneAddr(plane, addr);
176
@@ -922,7 +916,7 @@
177
 
178
     int x, y;
179
     const CUData* cu = frame->m_encData->getPicCTU(addr);
180
-    const PicYuv* reconPic = m_frame->m_reconPic;
181
+    const PicYuv* reconPic = m_frame->m_reconPic0;
182
     const pixel* fenc;
183
     const pixel* rec;
184
     intptr_t stride = reconPic->m_stride;
185
@@ -1030,10 +1024,10 @@
186
             for (y = 0; y < ctuHeight; y++)
187
             {
188
                 x = (y < startY ? startX : firstX);
189
-                int signLeft = signOf(recx - recx - 1);
190
+                int signLeft = x265_signOf(recx - recx - 1);
191
                 for (; x < endX; x++)
192
                 {
193
-                    int signRight = signOf(recx - recx + 1);
194
+                    int signRight = x265_signOf(recx - recx + 1);
195
                     int edgeType = signRight + signLeft + 2;
196
                     signLeft = -signRight;
197
 
198
@@ -1069,13 +1063,13 @@
199
             }
200
 
201
x265_3.6.tar.gz/source/encoder/search.cpp -> x265_4.0.tar.gz/source/encoder/search.cpp Changed
201
 
1
@@ -76,6 +76,9 @@
2
     m_param = &param;
3
     m_bFrameParallel = param.frameNumThreads > 1;
4
     m_numLayers = g_log2Sizeparam.maxCUSize - 2;
5
+#if ENABLE_SCC_EXT
6
+    m_ibcEnabled = param.bEnableSCC;
7
+#endif
8
 
9
     m_rdCost.setPsyRdScale(param.psyRd);
10
     m_rdCost.setSsimRd(param.bSsimRd);
11
@@ -171,6 +174,11 @@
12
     CHECKED_MALLOC(m_tsResidual, int16_t, MAX_TS_SIZE * MAX_TS_SIZE);
13
     CHECKED_MALLOC(m_tsRecon,    pixel,   MAX_TS_SIZE * MAX_TS_SIZE);
14
 
15
+#if ENABLE_SCC_EXT
16
+    m_numBVs = 0;
17
+    m_numBV16s = 0;
18
+#endif
19
+
20
     return ok;
21
 
22
 fail:
23
@@ -496,7 +504,7 @@
24
     }
25
 
26
     // set reconstruction for next intra prediction blocks if full TU prediction won
27
-    PicYuv*  reconPic = m_frame->m_reconPic;
28
+    PicYuv*  reconPic = m_frame->m_reconPic0;
29
     pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
30
     intptr_t picStride = reconPic->m_stride;
31
     primitives.cusizeIdx.copy_pp(picReconY, picStride, reconQt, reconQtStride);
32
@@ -672,7 +680,7 @@
33
     }
34
 
35
     // set reconstruction for next intra prediction blocks
36
-    PicYuv*  reconPic = m_frame->m_reconPic;
37
+    PicYuv*  reconPic = m_frame->m_reconPic0;
38
     pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
39
     intptr_t picStride = reconPic->m_stride;
40
     primitives.cusizeIdx.copy_pp(picReconY, picStride, reconQt, reconQtStride);
41
@@ -723,7 +731,7 @@
42
         uint32_t sizeIdx   = log2TrSize - 2;
43
         primitives.cusizeIdx.calcresidualstride % 64 == 0(fenc, pred, residual, stride);
44
 
45
-        PicYuv*  reconPic = m_frame->m_reconPic;
46
+        PicYuv*  reconPic = m_frame->m_reconPic0;
47
         pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
48
         intptr_t picStride = reconPic->m_stride;
49
 
50
@@ -887,7 +895,7 @@
51
             coeff_t* coeffC        = m_rqtqtLayer.coeffRQTchromaId + coeffOffsetC;
52
             pixel*   reconQt       = m_rqtqtLayer.reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
53
             uint32_t reconQtStride = m_rqtqtLayer.reconQtYuv.m_csize;
54
-            PicYuv*  reconPic = m_frame->m_reconPic;
55
+            PicYuv*  reconPic = m_frame->m_reconPic0;
56
             pixel*   picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
57
             intptr_t picStride = reconPic->m_strideC;
58
 
59
@@ -1078,7 +1086,7 @@
60
             cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
61
             cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
62
 
63
-            PicYuv*  reconPic = m_frame->m_reconPic;
64
+            PicYuv*  reconPic = m_frame->m_reconPic0;
65
             pixel*   reconPicC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
66
             intptr_t picStride = reconPic->m_strideC;
67
             primitives.cusizeIdxC.copy_pp(reconPicC, picStride, reconQt, reconQtStride);
68
@@ -1185,7 +1193,7 @@
69
             int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
70
             uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
71
             coeff_t* coeffC        = cu.m_trCoeffttype + coeffOffsetC;
72
-            PicYuv*  reconPic = m_frame->m_reconPic;
73
+            PicYuv*  reconPic = m_frame->m_reconPic0;
74
             pixel*   picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
75
             intptr_t picStride = reconPic->m_strideC;
76
 
77
@@ -1284,6 +1292,11 @@
78
 
79
     updateModeCost(intraMode);
80
     checkDQP(intraMode, cuGeom);
81
+
82
+#if ENABLE_SCC_EXT
83
+    if (m_param->bEnableSCC)
84
+        intraMode.reconYuv.copyToPicYuv(*m_frame->m_reconPic1, cu.m_cuAddr, cuGeom.absPartIdx);
85
+#endif
86
 }
87
 
88
 /* Note that this function does not save the best intra prediction, it must
89
@@ -1671,7 +1684,7 @@
90
              * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
91
              * it is not updating m_rdContextsdepth.cur for the later PUs which I suspect is slightly wrong. I think
92
              * that the contexts should be tracked through each PU */
93
-            PicYuv*  reconPic = m_frame->m_reconPic;
94
+            PicYuv*  reconPic = m_frame->m_reconPic0;
95
             pixel*   dst       = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
96
             uint32_t dststride = reconPic->m_stride;
97
             const pixel*   src = reconYuv->getLumaAddr(absPartIdx);
98
@@ -1844,7 +1857,7 @@
99
         if (!tuIterator.isLastSection())
100
         {
101
             uint32_t zorder    = cuGeom.absPartIdx + absPartIdxC;
102
-            PicYuv*  reconPic  = m_frame->m_reconPic;
103
+            PicYuv*  reconPic  = m_frame->m_reconPic0;
104
             uint32_t dststride = reconPic->m_strideC;
105
             const pixel* src;
106
             pixel* dst;
107
@@ -1895,7 +1908,9 @@
108
     MVField  candMvFieldMRG_MAX_NUM_CANDS2;
109
     uint8_t  candDirMRG_MAX_NUM_CANDS;
110
     uint32_t numMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, candMvField, candDir);
111
-
112
+#if ENABLE_SCC_EXT
113
+    restrictBipredMergeCand(&cu, 0, candMvField, candDir, numMergeCand);
114
+#else
115
     if (cu.isBipredRestriction())
116
     {
117
         /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */
118
@@ -1908,6 +1923,7 @@
119
             }
120
         }
121
     }
122
+#endif
123
 
124
     Yuv& tempYuv = m_rqtcuGeom.depth.tmpPredYuv;
125
 
126
@@ -1936,6 +1952,12 @@
127
                 continue;
128
         }
129
 
130
+#if ENABLE_SCC_EXT
131
+        if ((candDirmergeCand == 1 || candDirmergeCand == 3) && (m_slice->m_refPOCList0candMvFieldmergeCand0.refIdx == m_slice->m_poc))
132
+        {
133
+            continue;
134
+        }
135
+#endif
136
         cu.m_mv0pu.puAbsPartIdx = candMvFieldmergeCand0.mv;
137
         cu.m_refIdx0pu.puAbsPartIdx = (int8_t)candMvFieldmergeCand0.refIdx;
138
         cu.m_mv1pu.puAbsPartIdx = candMvFieldmergeCand1.mv;
139
@@ -2015,7 +2037,12 @@
140
                 continue;
141
         }
142
         cu.clipMv(mvCand);
143
-        predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicListlistref, mvCand);
144
+#if ENABLE_SCC_EXT
145
+        if (m_slice->m_param->bEnableSCC && !list && ref == m_slice->m_numRefIdx0 - 1)
146
+            predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refFrameListlistref->m_reconPic1, mvCand);
147
+        else
148
+#endif
149
+            predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicListlistref, mvCand);
150
         costsi = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size);
151
     }
152
 
153
@@ -2086,13 +2113,18 @@
154
 void Search::singleMotionEstimation(Search& master, Mode& interMode, const PredictionUnit& pu, int part, int list, int ref)
155
 {
156
     uint32_t bits = master.m_listSelBitslist + MVP_IDX_BITS;
157
-    bits += getTUBits(ref, m_slice->m_numRefIdxlist);
158
+    int numIdx = m_slice->m_numRefIdxlist;
159
+#if ENABLE_SCC_EXT
160
+    if (!list && m_ibcEnabled)
161
+        numIdx--;
162
+#endif
163
+    bits += getTUBits(ref, numIdx);
164
 
165
     MotionData* bestME = interMode.bestMEpart;
166
 
167
     // 12 mv candidates including lowresMV
168
     MV  mvc(MD_ABOVE_LEFT + 1) * 2 + 2;
169
-    int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCandlistref, mvc);
170
+    int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCandlistref, mvc, 0, pu.puAbsPartIdx);
171
 
172
     const MV* amvp = interMode.amvpCandlistref;
173
     int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
174
@@ -2102,22 +2134,24 @@
175
     if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging if lowresMV is not available */
176
     {
177
         MV lmv = getLowresMV(interMode.cu, pu, list, ref);
178
-        if (lmv.notZero())
179
+        int layer = m_param->numViews > 1 ? m_frame->m_viewId : (m_param->numScalableLayers > 1) ? m_frame->m_sLayerId : 0;
180
+        if (lmv.notZero() && !layer)
181
             mvcnumMvc++ = lmv;
182
         if (m_param->bEnableHME)
183
             mvp_lowres = lmv;
184
     }
185
 
186
+    m_vertRestriction = interMode.cu.m_slice->m_refPOCListlistref == interMode.cu.m_slice->m_poc;
187
     setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);
188
 
189
-    int satdCost = m_me.motionEstimate(&m_slice->m_mreflistref, mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, 
190
+    int satdCost = m_me.motionEstimate(&m_slice->m_mreflistref, mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
191
       m_param->bSourceReferenceEstimation ? m_slice->m_refFrameListlistref->m_fencPic->getLumaAddr(0) : 0);
192
 
193
     if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
194
     {
195
         MV outmv_lowres;
196
         setSearchRange(interMode.cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
197
-        int lowresMvCost = m_me.motionEstimate(&m_slice->m_mreflistref, mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
198
+        int lowresMvCost = m_me.motionEstimate(&m_slice->m_mreflistref, mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices, m_vertRestriction,
199
             m_param->bSourceReferenceEstimation ? m_slice->m_refFrameListlistref->m_fencPic->getLumaAddr(0) : 0);
200
         if (lowresMvCost < satdCost)
201
x265_3.6.tar.gz/source/encoder/search.h -> x265_4.0.tar.gz/source/encoder/search.h Changed
53
 
1
@@ -286,6 +286,16 @@
2
     int32_t         m_sliceMaxY;
3
     int32_t         m_sliceMinY;
4
 
5
+    bool            m_vertRestriction;
6
+
7
+#if ENABLE_SCC_EXT
8
+    int             m_ibcEnabled;
9
+    int             m_numBVs;
10
+    int             m_numBV16s;
11
+    MV              m_BVs64;
12
+    uint32_t        m_lastCandCost;
13
+#endif
14
+
15
 #if DETAILED_CU_STATS
16
     /* Accumulate CU statistics separately for each frame encoder */
17
     CUStats         m_statsX265_MAX_FRAME_THREADS;
18
@@ -309,7 +319,7 @@
19
     void     encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
20
 
21
     // estimation inter prediction (non-skip)
22
-    void     predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks2);
23
+    void      predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks2, MV* iMVCandList = NULL);
24
     void     searchMV(Mode& interMode, int list, int ref, MV& outmv, MV mvp3, int numMvc, MV* mvc);
25
     // encode residual and compute rd-cost for inter mode
26
     void     encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom);
27
@@ -329,6 +339,25 @@
28
 
29
     MV getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref);
30
 
31
+#if ENABLE_SCC_EXT
32
+    bool      predIntraBCSearch(Mode& intraBCMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc);
33
+    void      intraBlockCopyEstimate(Mode& intraBCMode, const CUGeom& cuGeom, int puIdx, MV* pred, MV& mv, uint32_t& cost, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc);
34
+    void      setIntraSearchRange(Mode& intraBCMode, MV& pred, int puIdx, int roiWidth, int roiHeight, MV& searchRangeLT, MV& searchRangeRB);
35
+    void      intraPatternSearch(Mode& intraBCMode, const CUGeom& cuGeom, int puIdx, uint32_t partAddr, pixel* refY, int refStride, MV* searchRangeLT, MV* searchRangeRB,
36
+        MV& mv, uint32_t& cost, int roiwidth, int roiheight, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc);
37
+    bool      isValidIntraBCSearchArea(CUData* cu, int predX, int predY, int roiWidth, int roiHeight, int partOffset);
38
+    bool      isBlockVectorValid(int xPos, int yPos, int width, int height, CUData* pcCU,
39
+        int xStartInCU, int yStartInCU, int xBv, int yBv, int ctuSize);
40
+    void      intraBCSearchMVCandUpdate(uint32_t sad, int x, int y, uint32_t* sadBestCand, MV* cMVCand);
41
+    void      updateBVMergeCandLists(int roiWidth, int roiHeight, MV* mvCand, IBC& ibc);
42
+    int       intraBCSearchMVChromaRefine(Mode& intraBCMode, const CUGeom& cuGeom, int roiWidth, int roiHeight, int cuPelX, int cuPelY, uint32_t* sadBestCand, MV* cMVCand,
43
+        uint32_t partOffset, int puIdx);
44
+    static    uint32_t mergeCandLists(MV* dst, uint32_t dn, MV* src, uint32_t sn, bool isSrcQuarPel);
45
+    uint32_t  getSAD(pixel* ref, int refStride, const pixel* curr, int currStride, int width, int height);
46
+    bool      predMixedIntraBCInterSearch(Mode& intraBCMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, MV* iMVCandList);
47
+    void      restrictBipredMergeCand(CUData* cu, uint32_t puIdx, MVField(*mvFieldNeighbours)2, uint8_t* interDirNeighbours, uint32_t numValidMergeCand);
48
+#endif
49
+
50
     class PME : public BondedTaskGroup
51
     {
52
     public:
53
x265_3.6.tar.gz/source/encoder/sei.cpp -> x265_4.0.tar.gz/source/encoder/sei.cpp Changed
19
 
1
@@ -36,7 +36,7 @@
2
 
3
 /* marshal a single SEI message sei, storing the marshalled representation
4
 * in bitstream bs */
5
-void SEI::writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested)
6
+void SEI::writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested, int layer)
7
 {
8
     if (!isNested)
9
         bs.resetBits();
10
@@ -68,7 +68,7 @@
11
     {
12
         if (nalUnitType != NAL_UNIT_UNSPECIFIED)
13
             bs.writeByteAlignment();
14
-        list.serialize(nalUnitType, bs, (1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N)));
15
+        list.serialize(nalUnitType, bs, layer, (1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N)));
16
     }
17
 }
18
 
19
x265_3.6.tar.gz/source/encoder/sei.h -> x265_4.0.tar.gz/source/encoder/sei.h Changed
201
 
1
@@ -38,7 +38,7 @@
2
 public:
3
     /* SEI users call writeSEImessages() to marshal an SEI to a bitstream.
4
     * The writeSEImessages() method calls writeSEI() which encodes the header */
5
-    void writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested);
6
+    void writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested, int layerId = 0);
7
     void setSize(uint32_t size);
8
     static char* base64Decode(char encodedString[], int base64EncodeLength);
9
     virtual ~SEI() {}
10
@@ -189,6 +189,228 @@
11
     }
12
 };
13
 
14
+#if ENABLE_ALPHA
15
+class SEIAlphaChannelInfo : public SEI
16
+{
17
+public:
18
+    SEIAlphaChannelInfo()
19
+    {
20
+        m_payloadType = ALPHA_CHANNEL_INFO;
21
+        m_payloadSize = 0;
22
+    }
23
+
24
+    bool alpha_channel_cancel_flag;
25
+    void writeSEI(const SPS&)
26
+    {
27
+        WRITE_CODE(alpha_channel_cancel_flag, 1, "alpha_channel_cancel_flag");
28
+        if (!alpha_channel_cancel_flag)
29
+        {
30
+            WRITE_CODE(0, 3, "alpha_channel_use_idc");
31
+            WRITE_CODE(0, 3, "alpha_channel_bit_depth_minus8");
32
+            WRITE_CODE(0, 9, "alpha_transparent_value");
33
+            WRITE_CODE(255, 9, "alpha_opaque_value");
34
+            WRITE_CODE(0, 1, "alpha_channel_incr_flag");
35
+            WRITE_CODE(0, 1, "alpha_channel_clip_flag");
36
+        }
37
+        if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
38
+        {
39
+            WRITE_FLAG(1, "payload_bit_equal_to_one");
40
+            while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
41
+            {
42
+                WRITE_FLAG(0, "payload_bit_equal_to_zero");
43
+            }
44
+        }
45
+    }
46
+};
47
+#endif
48
+
49
+#if ENABLE_MULTIVIEW
50
+class SEIThreeDimensionalReferenceDisplaysInfo : public SEI
51
+{
52
+public:
53
+    SEIThreeDimensionalReferenceDisplaysInfo()
54
+    {
55
+        m_payloadType = THREE_DIMENSIONAL_REFERENCE_DISPLAYS_INFO;
56
+        m_payloadSize = 0;
57
+    }
58
+
59
+    int  m_numRefDisplaysMinus1 = 0;
60
+    bool m_refViewingDistanceFlag = false;
61
+    bool m_additionalShiftPresentFlag = false;
62
+    void writeSEI(const SPS&)
63
+    {
64
+        WRITE_UVLC(31, "prec_ref_display_width");
65
+        WRITE_FLAG(m_refViewingDistanceFlag, "ref_viewing_distance_flag");
66
+        if (m_refViewingDistanceFlag)
67
+        {
68
+            WRITE_UVLC(0, "prec_ref_viewing_dist");
69
+        }
70
+        WRITE_UVLC(0, "num_ref_displays_minus1");
71
+        for (int i = 0; i <= m_numRefDisplaysMinus1; i++)
72
+        {
73
+            WRITE_UVLC(0, "left_view_id");
74
+            WRITE_UVLC(1, "right_view_id");
75
+            WRITE_CODE(0, 6, "exponent_ref_display_width");
76
+            WRITE_CODE(0, 2, "mantissa_ref_display_width");
77
+            if (m_refViewingDistanceFlag)
78
+            {
79
+                WRITE_CODE(0, 6, "exponent_ref_viewing_distance");
80
+                WRITE_CODE(0, 1, "mantissa_ref_viewing_distance");
81
+            }
82
+            WRITE_FLAG(m_additionalShiftPresentFlag, "additional_shift_present_flag");
83
+            if (m_additionalShiftPresentFlag)
84
+            {
85
+                WRITE_CODE(0, 10, "num_sample_shift_plus512");
86
+            }
87
+        }
88
+        WRITE_FLAG(0, "three_dimensional_reference_displays_extension_flag");
89
+
90
+        if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
91
+        {
92
+            WRITE_FLAG(1, "payload_bit_equal_to_one");
93
+            while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
94
+            {
95
+                WRITE_FLAG(0, "payload_bit_equal_to_zero");
96
+            }
97
+        }
98
+    }
99
+
100
+};
101
+
102
+class SEIMultiviewSceneInfo : public SEI
103
+{
104
+public:
105
+    SEIMultiviewSceneInfo()
106
+    {
107
+        m_payloadType = MULTIVIEW_SCENE_INFO;
108
+        m_payloadSize = 0;
109
+    }
110
+    void writeSEI(const SPS&)
111
+    {
112
+        WRITE_SVLC(-333, "min_disparity");
113
+        WRITE_UVLC(2047, "max_disparity_range");
114
+
115
+        if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
116
+        {
117
+            WRITE_FLAG(1, "payload_bit_equal_to_one");
118
+            while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
119
+            {
120
+                WRITE_FLAG(0, "payload_bit_equal_to_zero");
121
+            }
122
+        }
123
+    }
124
+};
125
+
126
+class SEIMultiviewAcquisitionInfo : public SEI
127
+{
128
+public:
129
+    SEIMultiviewAcquisitionInfo()
130
+    {
131
+        m_payloadType = MULTIVIEW_ACQUISITION_INFO;
132
+        m_payloadSize = 0;
133
+    }
134
+
135
+    int sign_r[3][3] = { {0,1,0},{1,0,0},{0,1,1} };
136
+    int exponent_r[3][3] = { {10,20,11},{10,5,11},{2,20,11} };
137
+    int mantissa_r[3][3] = { {4,9,1},{0,3,4},{3,3,7} };
138
+    int sign_t[1][3] = { 0,1,0 };
139
+    int exponent_t[1][3] = { 0,10,5 };
140
+    int mantissa_t[1][3] = { 1,8,9 };
141
+    int lenght_mantissa_r[3][3] = { {10,20,11},{10,5,11},{2,20,11} };
142
+    int length_mantissa_t[1][3] = { 1,10,5 };
143
+    bool m_intrinsicParamFlag = true;
144
+    bool m_extrinsicParamFlag = true;
145
+    bool m_intrinsicParamsEqualFlag = true;
146
+    void writeSEI(const SPS& sps)
147
+    {
148
+        WRITE_FLAG(m_intrinsicParamFlag, "intrinsic_param_flag");
149
+        WRITE_FLAG(m_extrinsicParamFlag, "extrinsic_param_flag");
150
+        if (m_intrinsicParamFlag)
151
+        {
152
+            WRITE_FLAG(m_intrinsicParamsEqualFlag, "intrinsic_params_equal_flag");
153
+            WRITE_UVLC(31, "prec_focal_length");
154
+            WRITE_UVLC(31, "prec_principal_point");
155
+            WRITE_UVLC(31, "prec_skew_factor");
156
+
157
+            for (int i = 0; i <= (m_intrinsicParamsEqualFlag ? 0 : sps.maxViews - 1); i++)
158
+            {
159
+                WRITE_FLAG(0, "sign_focal_length_x");
160
+                WRITE_CODE(0, 6, "exponent_focal_length_x");
161
+                WRITE_CODE(0, 1, "mantissa_focal_length_x");
162
+                WRITE_FLAG(0, "sign_focal_length_y");
163
+                WRITE_CODE(0, 6, "exponent_focal_length_y");
164
+                WRITE_CODE(0, 1, "mantissa_focal_length_y");
165
+                WRITE_FLAG(0, "sign_principal_point_x");
166
+                WRITE_CODE(0, 6, "exponent_principal_point_x");
167
+                WRITE_CODE(0, 1, "mantissa_principal_point_x");
168
+                WRITE_FLAG(0, "sign_principal_point_y");
169
+                WRITE_CODE(0, 6, "exponent_principal_point_y");
170
+                WRITE_CODE(0, 1, "mantissa_principal_point_y");
171
+                WRITE_FLAG(0, "sign_skew_factor");
172
+                WRITE_CODE(0, 6, "exponent_skew_factor");
173
+                WRITE_CODE(0, 1, "mantissa_skew_factor");
174
+            }
175
+        }
176
+
177
+        if (m_extrinsicParamFlag)
178
+        {
179
+            WRITE_UVLC(31, "prec_rotation_param");
180
+            WRITE_UVLC(31, "prec_translation_param");
181
+            for (int i = 0; i <= 0; i++)
182
+            {
183
+                for (int j = 0; j <= 2; j++)  /* row */
184
+                {
185
+                    for (int k = 0; k <= 2; k++)  /* column */
186
+                    {
187
+                        WRITE_FLAG(sign_r[j][k], "sign_r");
188
+                        WRITE_CODE(exponent_r[j][k], 6, "exponent_r");
189
+                        WRITE_CODE(mantissa_r[j][k], lenght_mantissa_r[j][k], "mantissa_r");
190
+                    }
191
+                    WRITE_FLAG(sign_t[i][j], "sign_t");
192
+                    WRITE_CODE(exponent_t[i][j], 6, "exponent_t");
193
+                    WRITE_CODE(mantissa_t[i][j], length_mantissa_t[i][j], "mantissa_t");
194
+                }
195
+            }
196
+        }
197
+        if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
198
+        {
199
+            WRITE_FLAG(1, "payload_bit_equal_to_one");
200
+            while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
201
x265_3.6.tar.gz/source/encoder/slicetype.cpp -> x265_4.0.tar.gz/source/encoder/slicetype.cpp Changed
22
 
1
@@ -1324,7 +1324,7 @@
2
     int l0poc = slice->m_rps.numberOfNegativePictures ? slice->m_refPOCList[0][0] : -1;
3
     int l1poc = slice->m_refPOCList[1][0];
4
 
5
-    switch (slice->m_sliceType)
6
+    switch (slice->m_origSliceType)
7
     {
8
     case I_SLICE:
9
         framesp0 = &curFrame->m_lowres;
10
@@ -4160,9 +4160,9 @@
11
         /* ME will never return a cost larger than the cost @MVP, so we do not
12
          * have to check that ME cost is more than the estimated merge cost */
13
         if(!hme)
14
-            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices);
15
+            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices, 0);
16
         else
17
-            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices, fref->lowerResPlane[0]);
18
+            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices, 0, fref->lowerResPlane[0]);
19
         if (skipCost < 64 && skipCost < fencCost && bBidir)
20
         {
21
             fencCost = skipCost;
22
x265_3.6.tar.gz/source/encoder/weightPrediction.cpp -> x265_4.0.tar.gz/source/encoder/weightPrediction.cpp Changed
17
 
1
@@ -491,8 +491,14 @@
2
         lumaDenom = weights0.log2WeightDenom;
3
         chromaDenom = weights1.log2WeightDenom;
4
 
5
+        int numIdx = slice.m_numRefIdx[list];
6
+#if ENABLE_SCC_EXT
7
+        if (!list && param.bEnableSCC)
8
+            numIdx--;
9
+#endif
10
+
11
         /* reset weight states */
12
-        for (int ref = 1; ref < slice.m_numRefIdx[list]; ref++)
13
+        for (int ref = 1; ref < numIdx; ref++)
14
         {
15
             SET_WEIGHT(wp[list][ref][0], false, 1 << lumaDenom, lumaDenom, 0);
16
             SET_WEIGHT(wp[list][ref][1], false, 1 << chromaDenom, chromaDenom, 0);
17
x265_3.6.tar.gz/source/input/input.cpp -> x265_4.0.tar.gz/source/input/input.cpp Changed
17
 
1
@@ -27,12 +27,12 @@
2
 
3
 using namespace X265_NS;
4
 
5
-InputFile* InputFile::open(InputFileInfo& info, bool bForceY4m)
6
+InputFile* InputFile::open(InputFileInfo& info, bool bForceY4m, bool alpha, int format)
7
 {
8
     const char * s = strrchr(info.filename, '.');
9
 
10
     if (bForceY4m || (s && !strcmp(s, ".y4m")))
11
-        return new Y4MInput(info);
12
+        return new Y4MInput(info, alpha, format);
13
     else
14
-        return new YUVInput(info);
15
+        return new YUVInput(info, alpha, format);
16
 }
17
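For orientation, the widened InputFile::open() signature above threads the alpha flag and the frame-packing format down to the Y4M/YUV readers. A minimal calling sketch, with illustrative values that are assumptions rather than part of this diff:

    InputFileInfo info;          // filename, depth, csp, geometry, fps, sar filled in by the caller
    bool forceY4m = false;
    bool hasAlpha = true;        // e.g. when the encode uses more than one scalable layer
    int  format   = 0;           // 0: normal, 1: side-by-side, 2: over-under packing
    InputFile* in = InputFile::open(info, forceY4m, hasAlpha, format);
    if (!in || in->isFail())
        { /* report the open failure and abort */ }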
x265_3.6.tar.gz/source/input/input.h -> x265_4.0.tar.gz/source/input/input.h Changed
10
 
1
@@ -66,7 +66,7 @@
2
 
3
     InputFile()           {}
4
 
5
-    static InputFile* open(InputFileInfo& info, bool bForceY4m);
6
+    static InputFile* open(InputFileInfo& info, bool bForceY4m, bool alpha, int format);
7
 
8
     virtual void startReader() = 0;
9
 
10
x265_3.6.tar.gz/source/input/y4m.cpp -> x265_4.0.tar.gz/source/input/y4m.cpp Changed
57
 
1
@@ -40,13 +40,14 @@
2
 using namespace X265_NS;
3
 using namespace std;
4
 static const char header[] = {'F','R','A','M','E'};
5
-Y4MInput::Y4MInput(InputFileInfo& info)
6
+Y4MInput::Y4MInput(InputFileInfo& info, bool alpha, int format)
7
 {
8
     for (int i = 0; i < QUEUE_SIZE; i++)
9
         buf[i] = NULL;
10
 
11
     threadActive = false;
12
     colorSpace = info.csp;
13
+    alphaAvailable = alpha;
14
     sarWidth = info.sarWidth;
15
     sarHeight = info.sarHeight;
16
     width = info.width;
17
@@ -68,11 +69,13 @@
18
         ifs = x265_fopen(info.filename, "rb");
19
     if (ifs && !ferror(ifs) && parseHeader())
20
     {
21
+        if (format == 1) width /= 2;
22
+        if (format == 2) height /= 2;
23
         int pixelbytes = depth > 8 ? 2 : 1;
24
-        for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
25
+        for (int i = 0; i < x265_cli_csps[colorSpace].planes + alphaAvailable; i++)
26
         {
27
-            int stride = (width >> x265_cli_csps[colorSpace].width[i]) * pixelbytes;
28
-            framesize += (stride * (height >> x265_cli_csps[colorSpace].height[i]));
29
+            int stride = ((width * (format == 1 ? 2 : 1)) >> x265_cli_csps[colorSpace].width[i]) * pixelbytes;
30
+            framesize += (stride * ((height * (format == 2 ? 2 : 1)) >> x265_cli_csps[colorSpace].height[i]));
31
         }
32
 
33
         threadActive = true;
34
@@ -390,12 +393,19 @@
35
         pic.height = height;
36
         pic.width = width;
37
         pic.colorSpace = colorSpace;
38
-        pic.stride[0] = width * pixelbytes;
39
+        pic.stride[0] = width * pixelbytes * (pic.format == 1 ? 2 : 1);
40
         pic.stride[1] = pic.stride[0] >> x265_cli_csps[colorSpace].width[1];
41
         pic.stride[2] = pic.stride[0] >> x265_cli_csps[colorSpace].width[2];
42
         pic.planes[0] = buf[read % QUEUE_SIZE];
43
-        pic.planes[1] = (char*)pic.planes[0] + pic.stride[0] * height;
44
-        pic.planes[2] = (char*)pic.planes[1] + pic.stride[1] * (height >> x265_cli_csps[colorSpace].height[1]);
45
+        pic.planes[1] = (char*)pic.planes[0] + pic.stride[0] * (height * (pic.format == 2 ? 2 : 1));
46
+        pic.planes[2] = (char*)pic.planes[1] + pic.stride[1] * ((height * (pic.format == 2 ? 2 : 1)) >> x265_cli_csps[colorSpace].height[1]);
47
+#if ENABLE_ALPHA
48
+        if (alphaAvailable)
49
+        {
50
+            pic.stride[3] = pic.stride[0] >> x265_cli_csps[colorSpace].width[3];
51
+            pic.planes[3] = (char*)pic.planes[2] + pic.stride[2] * (height >> x265_cli_csps[colorSpace].height[2]);
52
+        }
53
+#endif
54
         readCount.incr();
55
         return true;
56
     }
57
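To make the format handling in this hunk concrete: for a 1920x1080 side-by-side source (format 1), parseHeader() still reports 1920x1080, the per-view width is then halved to 960, and the stride/framesize terms multiply the width back by 2 so a full packed row of 1920 samples is read per line; for over-under (format 2) the height is halved instead and the plane offsets scale the height back by 2. (Worked example only; the resolution is illustrative.)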
x265_3.6.tar.gz/source/input/y4m.h -> x265_4.0.tar.gz/source/input/y4m.h Changed
19
 
1
@@ -55,6 +55,8 @@
2
 
3
     int colorSpace;
4
 
5
+    bool alphaAvailable;
6
+
7
     bool threadActive;
8
 
9
     ThreadSafeInteger readCount;
10
@@ -69,7 +71,7 @@
11
 
12
 public:
13
 
14
-    Y4MInput(InputFileInfo& info);
15
+    Y4MInput(InputFileInfo& info, bool alpha, int format);
16
 
17
     virtual ~Y4MInput();
18
     void release();
19
x265_3.6.tar.gz/source/input/yuv.cpp -> x265_4.0.tar.gz/source/input/yuv.cpp Changed
53
 
1
@@ -40,7 +40,7 @@
2
 using namespace X265_NS;
3
 using namespace std;
4
 
5
-YUVInput::YUVInput(InputFileInfo& info)
6
+YUVInput::YUVInput(InputFileInfo& info, bool alpha, int format)
7
 {
8
     for (int i = 0; i < QUEUE_SIZE; i++)
9
         buf[i] = NULL;
10
@@ -49,15 +49,16 @@
11
     width = info.width;
12
     height = info.height;
13
     colorSpace = info.csp;
14
+    alphaAvailable = alpha;
15
     threadActive = false;
16
     ifs = NULL;
17
 
18
     uint32_t pixelbytes = depth > 8 ? 2 : 1;
19
     framesize = 0;
20
-    for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
21
+    for (int i = 0; i < x265_cli_csps[colorSpace].planes + alphaAvailable; i++)
22
     {
23
-        uint32_t w = width >> x265_cli_csps[colorSpace].width[i];
24
-        uint32_t h = height >> x265_cli_csps[colorSpace].height[i];
25
+        int32_t w = (width * (format == 1 ? 2 : 1)) >> x265_cli_csps[colorSpace].width[i];
26
+        uint32_t h = (height * (format == 2 ? 2 : 1)) >> x265_cli_csps[colorSpace].height[i];
27
         framesize += w * h * pixelbytes;
28
     }
29
 
30
@@ -205,12 +206,19 @@
31
         pic.framesize = framesize;
32
         pic.height = height;
33
         pic.width = width;
34
-        pic.stride[0] = width * pixelbytes;
35
+        pic.stride[0] = width * pixelbytes * (pic.format == 1 ? 2 : 1);
36
         pic.stride[1] = pic.stride[0] >> x265_cli_csps[colorSpace].width[1];
37
         pic.stride[2] = pic.stride[0] >> x265_cli_csps[colorSpace].width[2];
38
         pic.planes[0] = buf[read % QUEUE_SIZE];
39
-        pic.planes[1] = (char*)pic.planes[0] + pic.stride[0] * height;
40
-        pic.planes[2] = (char*)pic.planes[1] + pic.stride[1] * (height >> x265_cli_csps[colorSpace].height[1]);
41
+        pic.planes[1] = (char*)pic.planes[0] + pic.stride[0] * (height * (pic.format == 2 ? 2 : 1));
42
+        pic.planes[2] = (char*)pic.planes[1] + pic.stride[1] * ((height * (pic.format == 2 ? 2 : 1)) >> x265_cli_csps[colorSpace].height[1]);
43
+#if ENABLE_ALPHA
44
+        if (alphaAvailable)
45
+        {
46
+            pic.stride[3] = pic.stride[0] >> x265_cli_csps[colorSpace].width[3];
47
+            pic.planes[3] = (char*)pic.planes[2] + pic.stride[2] * (height >> x265_cli_csps[colorSpace].height[2]);
48
+        }
49
+#endif
50
         readCount.incr();
51
         return true;
52
     }
53
x265_3.6.tar.gz/source/input/yuv.h -> x265_4.0.tar.gz/source/input/yuv.h Changed
19
 
1
@@ -47,6 +47,8 @@
2
 
3
     uint32_t framesize;
4
 
5
+    bool alphaAvailable;
6
+
7
     bool threadActive;
8
 
9
     ThreadSafeInteger readCount;
10
@@ -61,7 +63,7 @@
11
 
12
 public:
13
 
14
-    YUVInput(InputFileInfo& info);
15
+    YUVInput(InputFileInfo& info, bool alpha, int format);
16
 
17
     virtual ~YUVInput();
18
     void release();
19
x265_3.6.tar.gz/source/test/ipfilterharness.cpp -> x265_4.0.tar.gz/source/test/ipfilterharness.cpp Changed
103
 
1
@@ -67,7 +67,7 @@
2
     {
3
         int index = i % TEST_CASES;
4
 
5
-        for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
6
+        for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++)
7
         {
8
             rand_srcStride = rand() % 100 + 2;
9
             rand_dstStride = rand() % 100 + 64;
10
@@ -102,7 +102,7 @@
11
     {
12
         int index = i % TEST_CASES;
13
 
14
-        for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
15
+        for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++)
16
         {
17
             rand_srcStride = rand() % 100;
18
             rand_dstStride = rand() % 100 + 64;
19
@@ -144,7 +144,7 @@
20
     {
21
         int index = i % TEST_CASES;
22
 
23
-        for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
24
+        for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++)
25
         {
26
             // 0 : Interpolate W x H, 1 : Interpolate W x (H + 7)
27
             for (int isRowExt = 0; isRowExt < 2; isRowExt++)
28
@@ -185,7 +185,7 @@
29
     {
30
         int index = i % TEST_CASES;
31
 
32
-        for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
33
+        for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++)
34
         {
35
             rand_srcStride = rand() % 100;
36
             rand_dstStride = rand() % 100 + 64;
37
@@ -220,7 +220,7 @@
38
     {
39
         int index = i % TEST_CASES;
40
 
41
-        for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
42
+        for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++)
43
         {
44
             rand_srcStride = rand() % 100;
45
             rand_dstStride = rand() % 100 + 64;
46
@@ -255,7 +255,7 @@
47
     {
48
         int index = i % TEST_CASES;
49
 
50
-        for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++)
51
+        for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++)
52
         {
53
             rand_srcStride = rand() % 100;
54
             rand_dstStride = rand() % 100 + 64;
55
@@ -290,7 +290,7 @@
56
     {
57
         int index = i % TEST_CASES;
58
 
59
-        for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++)
60
+        for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++)
61
         {
62
             rand_srcStride = rand() % 100;
63
             rand_dstStride = rand() % 100 + 64;
64
@@ -325,7 +325,7 @@
65
     {
66
         int index = i % TEST_CASES;
67
 
68
-        for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++)
69
+        for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++)
70
         {
71
             // 0 : Interpolate W x H, 1 : Interpolate W x (H + 7)
72
             for (int isRowExt = 0; isRowExt < 2; isRowExt++)
73
@@ -366,7 +366,7 @@
74
     {
75
         int index = i % TEST_CASES;
76
 
77
-        for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++)
78
+        for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++)
79
         {
80
             rand_srcStride = rand() % 100;
81
             rand_dstStride = rand() % 100 + 64;
82
@@ -401,7 +401,7 @@
83
     {
84
         int index = i % TEST_CASES;
85
 
86
-        for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++)
87
+        for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++)
88
         {
89
             rand_srcStride = rand() % 100;
90
             rand_dstStride = rand() % 100 + 64;
91
@@ -436,9 +436,9 @@
92
     {
93
         int index = i % TEST_CASES;
94
 
95
-        for (int coeffIdxX = 0; coeffIdxX < 4; coeffIdxX++)
96
+        for (int coeffIdxX = 1; coeffIdxX < 4; coeffIdxX++)
97
         {
98
-            for (int coeffIdxY = 0; coeffIdxY < 4; coeffIdxY++)
99
+            for (int coeffIdxY = 1; coeffIdxY < 4; coeffIdxY++)
100
             {
101
                 rand_srcStride = rand() % 100;
102
                 rand_dstStride = rand() % 100 + 64;
103
x265_3.6.tar.gz/source/test/mbdstharness.cpp -> x265_4.0.tar.gz/source/test/mbdstharness.cpp Changed
18
 
1
@@ -260,8 +260,14 @@
2
         uint32_t optReturnValue = 0;
3
         uint32_t refReturnValue = 0;
4
 
5
-        int bits = rand() % 32;
6
-        int valueToAdd = rand() % (1 << bits);
7
+        int log2TrSize = rand() % 4 + 2;
8
+        const int qp = rand() % (QP_MAX_SPEC + QP_BD_OFFSET + 1);
9
+        const int per = qp / 6;
10
+        const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
11
+
12
+        /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */
13
+        int bits = QUANT_SHIFT + per + transformShift;
14
+        int valueToAdd = (1 << (bits - 1));
15
         int cmp_size = sizeof(short) * height * width;
16
         int numCoeff = height * width;
17
 
18
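The replacement above derives the rounding offset the same way the non-RDOQ quantizer does instead of drawing a random bit count. As a worked example, assuming the usual HEVC-style constants QUANT_SHIFT = 14 and MAX_TR_DYNAMIC_RANGE = 15 (assumptions, not shown in this hunk): in an 8-bit build with log2TrSize = 3 and qp = 30, per = 5 and transformShift = 15 - 8 - 3 = 4, so bits = 14 + 5 + 4 = 23 and valueToAdd = 1 << 22.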
x265_3.6.tar.gz/source/test/pixelharness.cpp -> x265_4.0.tar.gz/source/test/pixelharness.cpp Changed
33
 
1
@@ -1373,8 +1373,7 @@
2
         ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
3
         checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
4
 
5
-        if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
6
-            || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
7
+        if (   memcmp(stats_ref, stats_vec, sizeof(stats_ref))
8
             || memcmp(count_ref, count_vec, sizeof(count_ref)))
9
             return false;
10
 
11
@@ -1425,10 +1424,7 @@
12
         ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, upBufft_ref, endX, endY, stats_ref, count_ref);
13
         checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, upBufft_vec, endX, endY, stats_vec, count_vec);
14
 
15
-        // TODO: don't check upBuff*, the latest output pixels different, and can move into stack temporary buffer in future
16
-        if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
17
-            || memcmp(_upBufft_ref, _upBufft_vec, sizeof(_upBufft_ref))
18
-            || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
19
+        if (   memcmp(stats_ref, stats_vec, sizeof(stats_ref))
20
             || memcmp(count_ref, count_vec, sizeof(count_ref)))
21
             return false;
22
 
23
@@ -1476,8 +1472,7 @@
24
         ref(sbuf2, pbuf3, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
25
         checked(opt, sbuf2, pbuf3, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
26
 
27
-        if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
28
-            || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
29
+        if (   memcmp(stats_ref, stats_vec, sizeof(stats_ref))
30
             || memcmp(count_ref, count_vec, sizeof(count_ref)))
31
             return false;
32
 
33
x265_3.6.tar.gz/source/test/testbench.cpp -> x265_4.0.tar.gz/source/test/testbench.cpp Changed
54
 
1
@@ -159,10 +159,11 @@
2
 
3
     struct test_arch_t
4
     {
5
-        char name[12];
6
+        char name[13];
7
         int flag;
8
     } test_arch[] =
9
     {
10
+#if X265_ARCH_X86
11
         { "SSE2", X265_CPU_SSE2 },
12
         { "SSE3", X265_CPU_SSE3 },
13
         { "SSSE3", X265_CPU_SSSE3 },
14
@@ -172,11 +173,15 @@
15
         { "AVX2", X265_CPU_AVX2 },
16
         { "BMI2", X265_CPU_AVX2 | X265_CPU_BMI1 | X265_CPU_BMI2 },
17
         { "AVX512", X265_CPU_AVX512 },
18
+#else
19
         { "ARMv6", X265_CPU_ARMV6 },
20
         { "NEON", X265_CPU_NEON },
21
         { "SVE2", X265_CPU_SVE2 },
22
         { "SVE", X265_CPU_SVE },
23
+        { "Neon_DotProd", X265_CPU_NEON_DOTPROD },
24
+        { "Neon_I8MM", X265_CPU_NEON_I8MM },
25
         { "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
26
+#endif
27
         { "", 0 },
28
     };
29
 
30
@@ -190,10 +195,10 @@
31
         else
32
             continue;
33
 
34
-#if X265_ARCH_X86
35
+#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64)
36
         EncoderPrimitives vecprim;
37
         memset(&vecprim, 0, sizeof(vecprim));
38
-        setupInstrinsicPrimitives(vecprim, test_arch[i].flag);
39
+        setupIntrinsicPrimitives(vecprim, test_arch[i].flag);
40
         setupAliasPrimitives(vecprim);
41
         for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
42
         {
43
@@ -231,8 +236,8 @@
44
 
45
     EncoderPrimitives optprim;
46
     memset(&optprim, 0, sizeof(optprim));
47
-#if X265_ARCH_X86
48
-    setupInstrinsicPrimitives(optprim, cpuid);
49
+#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64)
50
+    setupIntrinsicPrimitives(optprim, cpuid);
51
 #endif
52
 
53
     setupAssemblyPrimitives(optprim, cpuid);
54
x265_3.6.tar.gz/source/test/testharness.h -> x265_4.0.tar.gz/source/test/testharness.h Changed
9
 
1
@@ -88,6 +88,7 @@
2
     // TO-DO: replace clock() function with appropriate ARM cpu instructions
3
     a = clock();
4
 #elif  X265_ARCH_ARM64
5
+    asm volatile("isb" : : : "memory");
6
     asm volatile("mrs %0, cntvct_el0" : "=r"(a));
7
 #endif
8
     return a;
9
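The isb added before the counter read acts as an instruction barrier, so cntvct_el0 is not sampled ahead of the code being timed. A self-contained sketch of the same idiom (hypothetical helper name, assuming an AArch64 target and a GCC/Clang-style compiler):

    #include <cstdint>

    // Read the AArch64 virtual counter after an instruction barrier so the
    // sample is ordered against the instructions that precede it.
    static inline uint64_t read_virtual_counter()
    {
    #if defined(__aarch64__)
        uint64_t cnt;
        asm volatile("isb" ::: "memory");
        asm volatile("mrs %0, cntvct_el0" : "=r"(cnt));
        return cnt;
    #else
        return 0; // fallback for other architectures
    #endif
    }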
x265_3.6.tar.gz/source/x265.h -> x265_4.0.tar.gz/source/x265.h Changed
187
 
1
@@ -371,6 +371,11 @@
2
     MASTERING_DISPLAY_INFO               = 137,
3
     CONTENT_LIGHT_LEVEL_INFO             = 144,
4
     ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147,
5
+    ALPHA_CHANNEL_INFO                   = 165,
6
+    THREE_DIMENSIONAL_REFERENCE_DISPLAYS_INFO = 176,
7
+    MULTIVIEW_SCENE_INFO                 = 178,
8
+    MULTIVIEW_ACQUISITION_INFO           = 179,
9
+    MULTIVIEW_VIEW_POSITION              = 180
10
 } SEIPayloadType;
11
 
12
 typedef struct x265_sei_payload
13
@@ -410,10 +415,10 @@
14
 
15
     /* Must be specified on input pictures, the number of planes is determined
16
      * by the colorSpace value */
17
-    void*   planes[3];
18
+    void*   planes[4];
19
 
20
     /* Stride is the number of bytes between row starts */
21
-    int     stride[3];
22
+    int     stride[4];
23
 
24
     /* Must be specified on input pictures. x265_picture_init() will set it to
25
      * the encoder's internal bit depth, but this field must describe the depth
26
@@ -487,6 +492,9 @@
27
     uint32_t picStruct;
28
 
29
     int    width;
30
+
31
+    int   layerID;
32
+    int    format;
33
 } x265_picture;
34
 
35
 typedef enum
36
@@ -536,11 +544,13 @@
37
 #define X265_CPU_SLOW_PALIGNR    (1 << 25)  /* such as on the AMD Bobcat */
38
 
39
 /* ARM */
40
-#define X265_CPU_ARMV6           0x0000001
41
-#define X265_CPU_NEON            0x0000002  /* ARM NEON */
42
-#define X265_CPU_SVE2            0x0000008  /* ARM SVE2 */
43
-#define X265_CPU_SVE             0x0000010  /* ARM SVE2 */
44
-#define X265_CPU_FAST_NEON_MRC   0x0000004  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
45
+#define X265_CPU_ARMV6           (1 << 0)
46
+#define X265_CPU_NEON            (1 << 1)   /* ARM NEON */
47
+#define X265_CPU_FAST_NEON_MRC   (1 << 2)   /* Transfer from NEON to ARM register is fast (Cortex-A9) */
48
+#define X265_CPU_SVE2            (1 << 3)   /* AArch64 SVE2 */
49
+#define X265_CPU_SVE             (1 << 4)   /* AArch64 SVE2 */
50
+#define X265_CPU_NEON_DOTPROD    (1 << 5)   /* AArch64 Neon DotProd */
51
+#define X265_CPU_NEON_I8MM       (1 << 6)   /* AArch64 Neon I8MM */
52
 
53
 /* IBM Power8 */
54
 #define X265_CPU_ALTIVEC         0x0000001
55
@@ -623,13 +633,49 @@
56
 #define X265_MAX_GOP_LENGTH 16
57
 #define MAX_T_LAYERS 7
58
 
59
+#if ENABLE_MULTIVIEW
60
+#define MAX_VIEWS 2
61
+#define MAX_VPS_NUM_SCALABILITY_TYPES     16
62
+#define MAX_VPS_LAYER_ID_PLUS1            MAX_VIEWS
63
+#define MULTIVIEW_SCALABILITY_IDX         1
64
+#else
65
+#define MAX_VIEWS 1
66
+#endif
67
+
68
+#if ENABLE_ALPHA
69
+#define MAX_SCALABLE_LAYERS     2
70
+#define MAX_VPS_NUM_SCALABILITY_TYPES     16
71
+#define MAX_VPS_LAYER_ID_PLUS1            MAX_SCALABLE_LAYERS
72
+#else
73
+#define MAX_SCALABLE_LAYERS     1
74
+#endif
75
+
76
+#if ENABLE_ALPHA || ENABLE_MULTIVIEW
77
+#define MAX_LAYERS              2
78
+#else
79
+#define MAX_LAYERS              1
80
+#endif
81
+
82
+#if ENABLE_SCC_EXT
83
+/* SCC Extension Options */
84
+#define SCC_EXT_IDX               3
85
+#define NUM_EXTENSION_FLAGS       8
86
+#define SCM_S0067_NUM_CANDIDATES  64
87
+#define CHROMA_REFINEMENT_CANDIDATES  8
88
+#define SCM_S0067_IBC_FULL_1D_SEARCH_FOR_PU  2 ///< Do full horizontal/vertical search for Nx2N
89
+#define SCM_S0067_MAX_CAND_SIZE  32 ///< 32 or 64, 16 by default
90
+#define NUM_RECON_VERSION          2
91
+#else
92
+#define NUM_RECON_VERSION          1
93
+#endif
94
+
95
 #define X265_IPRATIO_STRENGTH   1.43
96
 
97
 typedef struct x265_cli_csp
98
 {
99
     int planes;
100
-    int width[3];
101
-    int height[3];
102
+    int width[4];
103
+    int height[4];
104
 } x265_cli_csp;
105
 
106
 static const x265_cli_csp x265_cli_csps[] =
107
@@ -754,10 +800,9 @@
108
     char *pool;
109
     int thread;
110
     int subsample;
111
-    int enable_conf_interval;
112
 }x265_vmaf_commondata;
113
 
114
-static const x265_vmaf_commondata vcd[] = { { NULL, (char *)"/usr/local/share/model/vmaf_v0.6.1.pkl", NULL, NULL, 0, 0, 0, 0, 0, 0, 0, NULL, 0, 1, 0 } };
115
+static x265_vmaf_commondata vcd[] = { { NULL, (char *)"/usr/local/share/model/vmaf_v0.6.1.json", NULL, NULL, 0, 0, 0, 0, 0, 0, 0, NULL, 0, 1} };
116
 
117
 typedef struct x265_temporal_layer {
118
     int poc_offset;      /* POC offset */
119
@@ -2268,6 +2313,20 @@
120
 
121
     /*SBRC*/
122
     int      bEnableSBRC;
123
+    int mcstfFrameRange;
124
+
125
+    /*Alpha channel encoding*/
126
+    int      bEnableAlpha;
127
+    int      numScalableLayers;
128
+
129
+    /*Multi View Encoding*/
130
+    int      numViews;
131
+    int      format;
132
+
133
+    int      numLayers;
134
+
135
+    /*Screen Content Coding*/
136
+    int     bEnableSCC;
137
 } x265_param;
138
 
139
 /* x265_param_alloc:
140
@@ -2320,6 +2379,10 @@
141
     "main444-12", "main444-12-intra",
142
 
143
     "main444-16-intra", "main444-16-stillpicture", /* Not Supported! */
144
+
145
+#if ENABLE_SCC_EXT
146
+    "main-scc", "main10-scc", "main444-scc", "main444-10-scc", /* Screen content coding */
147
+#endif
148
     0
149
 };
150
 
151
@@ -2430,7 +2493,7 @@
152
  *      the payloads of all output NALs are guaranteed to be sequential in memory.
153
  *      To flush the encoder and retrieve delayed output pictures, pass pic_in as NULL.
154
  *      Once flushing has begun, all subsequent calls must pass pic_in as NULL. */
155
-int x265_encoder_encode(x265_encoder *encoder, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture *pic_out);
156
+int x265_encoder_encode(x265_encoder *encoder, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture **pic_out);
157
 
158
 /* x265_encoder_reconfig:
159
  *      various parameters from x265_param are copied.
160
@@ -2537,7 +2600,7 @@
161
 
162
 /* x265_calculate_vmaf_framelevelscore:
163
  *    returns VMAF score for each frame in a given input video. */
164
-double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata*);
165
+double x265_calculate_vmaf_framelevelscore(x265_param*, x265_vmaf_framedata*);
166
 /* x265_vmaf_encoder_log:
167
  *       write a line to the configured CSV file.  If a CSV filename was not
168
  *       configured, or file open failed, this function will perform no write.
169
@@ -2584,7 +2647,7 @@
170
     int           (*encoder_reconfig)(x265_encoder*, x265_param*);
171
     int           (*encoder_reconfig_zone)(x265_encoder*, x265_zone*);
172
     int           (*encoder_headers)(x265_encoder*, x265_nal**, uint32_t*);
173
-    int           (*encoder_encode)(x265_encoder*, x265_nal**, uint32_t*, x265_picture*, x265_picture*);
174
+    int           (*encoder_encode)(x265_encoder*, x265_nal**, uint32_t*, x265_picture*, x265_picture**);
175
     void          (*encoder_get_stats)(x265_encoder*, x265_stats*, uint32_t);
176
     void          (*encoder_log)(x265_encoder*, int, char**);
177
     void          (*encoder_close)(x265_encoder*);
178
@@ -2602,7 +2665,7 @@
179
     int           (*set_analysis_data)(x265_encoder *encoder, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes);
180
 #if ENABLE_LIBVMAF
181
     double        (*calculate_vmafscore)(x265_param *, x265_vmaf_data *);
182
-    double        (*calculate_vmaf_framelevelscore)(x265_vmaf_framedata *);
183
+    double        (*calculate_vmaf_framelevelscore)(x265_param *, x265_vmaf_framedata *);
184
     void          (*vmaf_encoder_log)(x265_encoder*, int, char**, x265_param *, x265_vmaf_data *);
185
 #endif
186
     int           (*zone_param_parse)(x265_param*, const char*, const char*);
187
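The x265_encoder_encode() change means the reconstructed-picture argument is now passed as x265_picture** (one slot per output layer). A minimal single-layer calling sketch, assuming enc and param were created elsewhere and ignoring error handling (all names here are illustrative):

    x265_picture in_pic, recon_pic;
    x265_picture* recon[1] = { &recon_pic };   /* one entry per layer; a single layer is assumed here */
    x265_nal* nals = NULL;
    uint32_t nal_count = 0;

    x265_picture_init(param, &in_pic);
    x265_picture_init(param, &recon_pic);
    /* ... fill in_pic.planes[] / in_pic.stride[] with the source frame ... */
    int frames_out = x265_encoder_encode(enc, &nals, &nal_count, &in_pic, recon);
    /* flush: keep passing pic_in == NULL until no more output is returned */
    while (x265_encoder_encode(enc, &nals, &nal_count, NULL, recon) > 0)
        ;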
x265_3.6.tar.gz/source/x265cli.cpp -> x265_4.0.tar.gz/source/x265cli.cpp Changed
201
 
1
@@ -374,6 +374,17 @@
2
         H0("   --no-frame-dup              Enable Frame duplication. Default %s\n", OPT(param->bEnableFrameDuplication));
3
         H0("   --dup-threshold <integer>     PSNR threshold for Frame duplication. Default %d\n", param->dupThreshold);
4
         H0("   --no-mcstf                  Enable GOP based temporal filter. Default %d\n", param->bEnableTemporalFilter);
5
+#if ENABLE_ALPHA
6
+        H0("   --alpha                       Enable alpha channel support. Default %d\n", param->bEnableAlpha);
7
+#endif
8
+#if ENABLE_MULTIVIEW
9
+        H0("   --num-views                   Number of Views for Multiview Encoding. Default %d\n", param->numViews);
10
+        H0("   --format                      Format of the input video 0 : normal, 1 : side-by-side, 2 : over-under  Default %d\n", param->format);
11
+        H0("   --multiview-config            Configuration file for Multiview Encoding\n");
12
+#endif
13
+#if ENABLE_SCC_EXT
14
+        H0("   --scc <integer>               Enable screen content coding. 0: Diabled, 1:Intrablockcopy fast search with 1x2 CTUs search range, 2: Intrablockcopy Full search. Default %d\n", param->bEnableSCC);
15
+#endif
16
 #ifdef SVT_HEVC
17
         H0("   --nosvt                     Enable SVT HEVC encoder %s\n", OPT(param->bEnableSvtHevc));
18
         H0("   --no-svt-hme                Enable Hierarchial motion estimation(HME) in SVT HEVC encoder \n");
19
@@ -416,12 +427,18 @@
20
             free(argString);
21
         }
22
 
23
-        if (input)
24
-            input->release();
25
-        input = NULL;
26
-        if (recon)
27
-            recon->release();
28
-        recon = NULL;
29
+        for (int i = 0; i < MAX_VIEWS; i++)
30
+        {
31
+            if (input[i])
32
+                input[i]->release();
33
+            input[i] = NULL;
34
+        }
35
+        for (int i = 0; i < MAX_LAYERS; i++)
36
+        {
37
+            if (recon[i])
38
+                recon[i]->release();
39
+            recon[i] = NULL;
40
+        }
41
         if (qpfile)
42
             fclose(qpfile);
43
         qpfile = NULL;
44
@@ -577,8 +594,12 @@
45
         int inputBitDepth = 8;
46
         int outputBitDepth = 0;
47
         int reconFileBitDepth = 0;
48
-        const char *inputfn = NULL;
49
-        const char *reconfn = NULL;
50
+        char* inputfn[MAX_VIEWS] = { NULL };
51
+        for (int view = 0; view < MAX_VIEWS; view++)
52
+        {
53
+            inputfn[view] = X265_MALLOC(char, sizeof(char) * 1024);
54
+        }
55
+        const char* reconfn[MAX_LAYERS] = { NULL };
56
         const char *outputfn = NULL;
57
         const char *preset = NULL;
58
         const char *tune = NULL;
59
@@ -717,8 +738,8 @@
60
                 OPT("frames") this->framesToBeEncoded = (uint32_t)x265_atoi(optarg, bError);
61
                 OPT("no-progress") this->bProgress = false;
62
                 OPT("output") outputfn = optarg;
63
-                OPT("input") inputfn = optarg;
64
-                OPT("recon") reconfn = optarg;
65
+                OPT("input") strcpy(inputfn[0], optarg);
66
+                OPT("recon") reconfn[0] = optarg;
67
                 OPT("input-depth") inputBitDepth = (uint32_t)x265_atoi(optarg, bError);
68
                 OPT("dither") this->bDither = true;
69
                 OPT("recon-depth") reconFileBitDepth = (uint32_t)x265_atoi(optarg, bError);
70
@@ -750,6 +771,14 @@
71
                     if (!this->scenecutAwareQpConfig)
72
                         x265_log_file(param, X265_LOG_ERROR, "%s scenecut aware qp config file not found or error in opening config file\n", optarg);
73
                 }
74
+#if ENABLE_MULTIVIEW
75
+                OPT("multiview-config")
76
+                {
77
+                    this->multiViewConfig = x265_fopen(optarg, "rb");
78
+                    if (!this->multiViewConfig)
79
+                        x265_log_file(param, X265_LOG_ERROR, "%s Multiview config file not found or error in opening config file\n", optarg);
80
+                }
81
+#endif
82
                 OPT("zonefile")
83
                 {
84
                     this->zoneFile = x265_fopen(optarg, "rb");
85
@@ -776,8 +805,10 @@
86
             }
87
         }
88
 
89
-        if (optind < argc && !inputfn)
90
-            inputfn = argv[optind++];
91
+#if !ENABLE_MULTIVIEW
92
+        if (optind < argc && !inputfn[0])
93
+            inputfn[0] = argv[optind++];
94
+#endif
95
         if (optind < argc && !outputfn)
96
             outputfn = argv[optind++];
97
         if (optind < argc)
98
@@ -793,9 +824,29 @@
99
             showHelp(param);
100
         }
101
 
102
-        if (!inputfn || !outputfn)
103
+#if ENABLE_MULTIVIEW
104
+        if (this->multiViewConfig)
105
+        {
106
+            if (!this->parseMultiViewConfig(inputfn))
107
+            {
108
+                x265_log(NULL, X265_LOG_ERROR, "Unable to parse multiview config file \n");
109
+                fclose(this->multiViewConfig);
110
+                this->multiViewConfig = NULL;
111
+            }
112
+        }
113
+#endif
114
+        param->numLayers = param->numViews > 1 ? param->numViews : (param->numScalableLayers > 1) ? param->numScalableLayers : 1;
115
+        if (!outputfn)
116
         {
117
             x265_log(param, X265_LOG_ERROR, "input or output file not specified, try --help for help\n");
118
+            for (int view = 0; view < param->numViews; view++)
119
+            {
120
+                if (!inputfn[view])
121
+                {
122
+                    x265_log(param, X265_LOG_ERROR, "input or output file not specified, try --help for help\n");
123
+                    return true;
124
+                }
125
+            }
126
             return true;
127
         }
128
 
129
@@ -816,51 +867,53 @@
130
             svtParam->encoderBitDepth = inputBitDepth;
131
         }
132
 #endif
133
-
134
-        InputFileInfo info;
135
-        info.filename = inputfn;
136
-        info.depth = inputBitDepth;
137
-        info.csp = param->internalCsp;
138
-        info.width = param->sourceWidth;
139
-        info.height = param->sourceHeight;
140
-        info.fpsNum = param->fpsNum;
141
-        info.fpsDenom = param->fpsDenom;
142
-        info.sarWidth = param->vui.sarWidth;
143
-        info.sarHeight = param->vui.sarHeight;
144
-        info.skipFrames = seek;
145
-        info.frameCount = 0;
146
-        getParamAspectRatio(param, info.sarWidth, info.sarHeight);
147
-
148
-
149
-        this->input = InputFile::open(info, this->bForceY4m);
150
-        if (!this->input || this->input->isFail())
151
+        InputFileInfo info[MAX_VIEWS];
152
+        for (int i = 0; i < param->numViews - !!param->format; i++)
153
         {
154
-            x265_log_file(param, X265_LOG_ERROR, "unable to open input file <%s>\n", inputfn);
155
-            return true;
156
-        }
157
+            info[i].filename = inputfn[i];
158
+            info[i].depth = inputBitDepth;
159
+            info[i].csp = param->internalCsp;
160
+            info[i].width = param->sourceWidth;
161
+            info[i].height = param->sourceHeight;
162
+            info[i].fpsNum = param->fpsNum;
163
+            info[i].fpsDenom = param->fpsDenom;
164
+            info[i].sarWidth = param->vui.sarWidth;
165
+            info[i].sarHeight = param->vui.sarHeight;
166
+            info[i].skipFrames = seek;
167
+            info[i].frameCount = 0;
168
+            getParamAspectRatio(param, info[i].sarWidth, info[i].sarHeight);
169
+
170
+            this->input[i] = InputFile::open(info[i], this->bForceY4m, param->numScalableLayers > 1, param->format);
171
+            if (!this->input[i] || this->input[i]->isFail())
172
+            {
173
+                x265_log_file(param, X265_LOG_ERROR, "unable to open input file <%s>\n", inputfn[i]);
174
+                return true;
175
+            }
176
 
177
-        if (info.depth < 8 || info.depth > 16)
178
-        {
179
-            x265_log(param, X265_LOG_ERROR, "Input bit depth (%d) must be between 8 and 16\n", inputBitDepth);
180
-            return true;
181
+            if (info[i].depth < 8 || info[i].depth > 16)
182
+            {
183
+                x265_log(param, X265_LOG_ERROR, "Input bit depth (%d) must be between 8 and 16\n", inputBitDepth);
184
+                return true;
185
+            }
186
         }
187
 
188
+            //TODO:Validate info params of both the views to equal values
189
         /* Unconditionally accept height/width/csp/bitDepth from file info */
190
-        param->sourceWidth = info.width;
191
-        param->sourceHeight = info.height;
192
-        param->internalCsp = info.csp;
193
-        param->sourceBitDepth = info.depth;
194
+            param->sourceWidth = info[0].width;
195
+            param->sourceHeight = info[0].height;
196
+            param->internalCsp = info[0].csp;
197
+            param->sourceBitDepth = info[0].depth;
198
 
199
         /* Accept fps and sar from file info if not specified by user */
200
         if (param->fpsDenom == 0 || param->fpsNum == 0)
201
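Putting the new command-line switches from this hunk together, some illustrative invocations (file names are placeholders, and each option is only available when the matching ENABLE_ALPHA / ENABLE_MULTIVIEW / ENABLE_SCC_EXT build flag is set):

    x265 --input rgba_source.y4m --alpha --output out.hevc
    x265 --num-views 2 --format 1 --multiview-config multiview_config.txt --output out.hevc
    x265 --input screen_capture.y4m --scc 1 --output out.hevc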
x265_3.6.tar.gz/source/x265cli.h -> x265_4.0.tar.gz/source/x265cli.h Changed
69
 
1
@@ -358,6 +358,17 @@
2
     { "dup-threshold", required_argument, NULL, 0 },
3
     { "mcstf",                 no_argument, NULL, 0 },
4
     { "no-mcstf",              no_argument, NULL, 0 },
5
+#if ENABLE_ALPHA
6
+    { "alpha",                 no_argument, NULL, 0 },
7
+#endif
8
+#if ENABLE_MULTIVIEW
9
+    { "num-views", required_argument, NULL, 0 },
10
+    { "multiview-config", required_argument, NULL, 0 },
11
+    { "format", required_argument, NULL, 0 },
12
+#endif
13
+#if ENABLE_SCC_EXT
14
+    { "scc",        required_argument, NULL, 0 },
15
+#endif
16
 #ifdef SVT_HEVC
17
     { "svt",     no_argument, NULL, 0 },
18
     { "no-svt",  no_argument, NULL, 0 },
19
@@ -393,13 +404,16 @@
20
 
21
     struct CLIOptions
22
     {
23
-        InputFile* input;
24
-        ReconFile* recon;
25
+        InputFile* input[MAX_VIEWS];
26
+        ReconFile* recon[MAX_LAYERS];
27
         OutputFile* output;
28
         FILE*       qpfile;
29
         FILE*       zoneFile;
30
         FILE*    dolbyVisionRpu;    /* File containing Dolby Vision BL RPU metadata */
31
         FILE*    scenecutAwareQpConfig; /* File containing scenecut aware frame quantization related CLI options */
32
+#if ENABLE_MULTIVIEW
33
+        FILE* multiViewConfig; /* File containing multi-view related CLI options */
34
+#endif
35
         const char* reconPlayCmd;
36
         const x265_api* api;
37
         x265_param* param;
38
@@ -431,13 +445,18 @@
39
         static const int UPDATE_INTERVAL = 250000;
40
         CLIOptions()
41
         {
42
-            input = NULL;
43
-            recon = NULL;
44
+            for (int i = 0; i < MAX_VIEWS; i++)
45
+                input[i] = NULL;
46
+            for (int i = 0; i < MAX_LAYERS; i++)
47
+                recon[i] = NULL;
48
             output = NULL;
49
             qpfile = NULL;
50
             zoneFile = NULL;
51
             dolbyVisionRpu = NULL;
52
             scenecutAwareQpConfig = NULL;
53
+#if ENABLE_MULTIVIEW
54
+            multiViewConfig = NULL;
55
+#endif
56
             reconPlayCmd = NULL;
57
             api = NULL;
58
             param = NULL;
59
@@ -470,6 +489,9 @@
60
         int rpuParser(x265_picture * pic);
61
         bool parseScenecutAwareQpConfig();
62
         bool parseScenecutAwareQpParam(int argc, char **argv, x265_param* globalParam);
63
+#if ENABLE_MULTIVIEW
64
+        bool parseMultiViewConfig(char** fn);
65
+#endif
66
     };
67
 #ifdef __cplusplus
68
 }
69
x265_3.6.tar.gz/x265Version.txt -> x265_4.0.tar.gz/x265Version.txt Changed
8
 
1
@@ -1,4 +1,4 @@
2
 #Attribute:         Values
3
-repositorychangeset: aa7f602f7
4
+repositorychangeset: 6318f22
5
 releasetagdistance: 1
6
-releasetag: 3.6
7
+releasetag: 4.0
8