Packman Build Service PMBS

We truncated the diff of some files because they were too big. If you want to see the full diff for every file, click here.

Changes of Revision 42

x265.changes Changed

@@ -1,4 +1,53 @@
 -------------------------------------------------------------------
+Thu Jun 13 05:58:19 UTC 2024 - Luigi Baldoni <aloisio@gmx.com>
+
+- Update to version 3.6
+  New features:
+  * Segment based Ratecontrol (SBRC) feature
+  * Motion-Compensated Spatio-Temporal Filtering
+  * Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware
+    Quantization)
+  * Histogram-Based Scene Change Detection
+  * Film-Grain characteristics as a SEI message to support Film
+    Grain Synthesis(FGS)
+  * Add temporal layer implementation(Hierarchical B-frame
+    implementation)
+  Enhancements to existing features:
+  * Added Dolby Vision 8.4 Profile Support
+  API changes:
+  * Add Segment based Ratecontrol(SBRC) feature: "--no-sbrc".
+  * Add command line parameter for mcstf feature: "--no-mctf".
+  * Add command line parameters for the scene cut aware qp
+    feature: "--scenecut-aware-qp" and "--masking-strength".
+  * Add command line parameters for Histogram-Based Scene Change
+    Detection: "--hist-scenecut".
+  * Add film grain characteristics as a SEI message to the
+    bitstream: "--film-grain <filename>"
+  * cli: add new option --cra-nal (Force nal type to CRA to all
+    frames except for the first frame, works only with keyint 1)
+  Optimizations:
+  * ARM64 NEON optimizations:- Several time-consuming C
+    functions have been optimized for the targeted platform -
+    aarch64. The overall performance increased by around 20%.
+  * SVE/SVE2 optimizations
+  Bug fixes:
+  * Linux bug to utilize all the cores
+  * Crash with hist-scenecut build when source resolution is not
+    multiple of minCuSize
+  * 32bit and 64bit builds generation for ARM
+  * bugs in zonefile feature (Reflect Zonefile Parameters inside
+    Lookahead, extra IDR issue, Avg I Slice QP value issue etc.)
+  * Add x86 ASM implementation for subsampling luma
+  * Fix for abrladder segfault with load reuse level 1
+  * Reorder miniGOP based on temporal layer hierarchy and add
+    support for more B frames
+  * Add MacOS aarch64 build support
+  * Fix boundary condition issue for Gaussian filter
+- Drop arm.patch and replace it with 0001-Fix-arm-flags.patch
+  and 0004-Do-not-build-with-assembly-support-on-arm.patch
+  (courtesy of Debian)
+
+-------------------------------------------------------------------
 Wed May 19 13:21:09 UTC 2021 - Luigi Baldoni <aloisio@gmx.com>
 
 - Build libx265_main10 and libx265_main12 unconditionally and

​x
 
@@ -1,4 +1,53 @@
 -------------------------------------------------------------------
+Thu Jun 13 05:58:19 UTC 2024 - Luigi Baldoni <aloisio@gmx.com>
+
+- Update to version 3.6
+  New features:
+  * Segment based Ratecontrol (SBRC) feature
+  * Motion-Compensated Spatio-Temporal Filtering
+  * Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware
+    Quantization)
+  * Histogram-Based Scene Change Detection
+  * Film-Grain characteristics as a SEI message to support Film
+    Grain Synthesis(FGS)
+  * Add temporal layer implementation(Hierarchical B-frame
+    implementation)
+  Enhancements to existing features:
+  * Added Dolby Vision 8.4 Profile Support
+  API changes:
+  * Add Segment based Ratecontrol(SBRC) feature: "--no-sbrc".
+  * Add command line parameter for mcstf feature: "--no-mctf".
+  * Add command line parameters for the scene cut aware qp
+    feature: "--scenecut-aware-qp" and "--masking-strength".
+  * Add command line parameters for Histogram-Based Scene Change
+    Detection: "--hist-scenecut".
+  * Add film grain characteristics as a SEI message to the
+    bitstream: "--film-grain <filename>"
+  * cli: add new option --cra-nal (Force nal type to CRA to all
+    frames except for the first frame, works only with keyint 1)
+  Optimizations:
+  * ARM64 NEON optimizations:- Several time-consuming C
+    functions have been optimized for the targeted platform -
+    aarch64. The overall performance increased by around 20%.
+  * SVE/SVE2 optimizations
+  Bug fixes:
+  * Linux bug to utilize all the cores
+  * Crash with hist-scenecut build when source resolution is not
+    multiple of minCuSize
+  * 32bit and 64bit builds generation for ARM
+  * bugs in zonefile feature (Reflect Zonefile Parameters inside
+    Lookahead, extra IDR issue, Avg I Slice QP value issue etc.)
+  * Add x86 ASM implementation for subsampling luma
+  * Fix for abrladder segfault with load reuse level 1
+  * Reorder miniGOP based on temporal layer hierarchy and add
+    support for more B frames
+  * Add MacOS aarch64 build support
+  * Fix boundary condition issue for Gaussian filter
+- Drop arm.patch and replace it with 0001-Fix-arm-flags.patch
+  and 0004-Do-not-build-with-assembly-support-on-arm.patch
+  (courtesy of Debian)
+
+-------------------------------------------------------------------
 Wed May 19 13:21:09 UTC 2021 - Luigi Baldoni <aloisio@gmx.com>
 
 - Build libx265_main10 and libx265_main12 unconditionally and
​

x265.spec Changed

@@ -1,7 +1,7 @@
 #
 # spec file for package x265
 #
-# Copyright (c) 2021 Packman Team <packman@links2linux.de>
+# Copyright (c) 2024 Packman Team <packman@links2linux.de>
 # Copyright (c) 2014 Torsten Gruner <t.gruner@katodev.de>
 #
 # All modifications and additions to the file contributed by third parties
@@ -17,21 +17,22 @@
 #
 
 
-%define sover   199
+%define sover   209
 %define libname lib%{name}
 %define libsoname %{libname}-%{sover}
-%define uver    3_5
+%define uver    3_6
 Name:           x265
-Version:        3.5
+Version:        3.6
 Release:        0
 Summary:        A free h265/HEVC encoder - encoder binary
 License:        GPL-2.0-or-later
 Group:          Productivity/Multimedia/Video/Editors and Convertors
 URL:            https://bitbucket.org/multicoreware/x265_git
 Source0:        https://bitbucket.org/multicoreware/x265_git/downloads/%{name}_%{version}.tar.gz
-Patch0:         arm.patch
 Patch1:         x265.pkgconfig.patch
 Patch2:         x265-fix_enable512.patch
+Patch3:         0001-Fix-arm-flags.patch
+Patch4:         0004-Do-not-build-with-assembly-support-on-arm.patch
 BuildRequires:  cmake >= 2.8.8
 BuildRequires:  gcc-c++
 BuildRequires:  nasm >= 2.13
@@ -130,6 +131,8 @@
 %cmake_install
 find %{buildroot} -type f -name "*.a" -delete -print0
 
+%check
+
 %post -n %{libsoname} -p /sbin/ldconfig
 %postun -n %{libsoname} -p /sbin/ldconfig

 
@@ -1,7 +1,7 @@
 #
 # spec file for package x265
 #
-# Copyright (c) 2021 Packman Team <packman@links2linux.de>
+# Copyright (c) 2024 Packman Team <packman@links2linux.de>
 # Copyright (c) 2014 Torsten Gruner <t.gruner@katodev.de>
 #
 # All modifications and additions to the file contributed by third parties
@@ -17,21 +17,22 @@
 #
 
 
-%define sover   199
+%define sover   209
 %define libname lib%{name}
 %define libsoname %{libname}-%{sover}
-%define uver    3_5
+%define uver    3_6
 Name:           x265
-Version:        3.5
+Version:        3.6
 Release:        0
 Summary:        A free h265/HEVC encoder - encoder binary
 License:        GPL-2.0-or-later
 Group:          Productivity/Multimedia/Video/Editors and Convertors
 URL:            https://bitbucket.org/multicoreware/x265_git
 Source0:        https://bitbucket.org/multicoreware/x265_git/downloads/%{name}_%{version}.tar.gz
-Patch0:         arm.patch
 Patch1:         x265.pkgconfig.patch
 Patch2:         x265-fix_enable512.patch
+Patch3:         0001-Fix-arm-flags.patch
+Patch4:         0004-Do-not-build-with-assembly-support-on-arm.patch
 BuildRequires:  cmake >= 2.8.8
 BuildRequires:  gcc-c++
 BuildRequires:  nasm >= 2.13
@@ -130,6 +131,8 @@
 %cmake_install
 find %{buildroot} -type f -name "*.a" -delete -print0
 
+%check
+
 %post -n %{libsoname} -p /sbin/ldconfig
 %postun -n %{libsoname} -p /sbin/ldconfig
 
​

0001-Fix-arm-flags.patch Added

@@ -0,0 +1,39 @@
+From: Sebastian Ramacher <sramacher@debian.org>
+Date: Sun, 21 Jun 2020 17:54:56 +0200
+Subject: Fix arm* flags
+
+---
+ source/CMakeLists.txt | 7 ++-----
+ 1 file changed, 2 insertions(+), 5 deletions(-)
+
+diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
+index ab5ddfe..eb9b19b 100755
+--- a/source/CMakeLists.txt
++++ b/source/CMakeLists.txt
+@@ -253,10 +253,7 @@ if(GCC)
+     elseif(ARM)
+         find_package(Neon)
+         if(CPU_HAS_NEON)
+-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
+             add_definitions(-DHAVE_NEON)
+-        else()
+-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
+         endif()
+     endif()
+ 	if(ARM64 OR CROSS_COMPILE_ARM64)
+@@ -265,13 +262,13 @@ if(GCC)
+         find_package(SVE2)
+         if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
+             message(STATUS "Found SVE2")
+-	        set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
++	        set(ARM_ARGS -fPIC -flax-vector-conversions)
+             add_definitions(-DHAVE_SVE2)
+             add_definitions(-DHAVE_SVE)
+             add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
+         elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
+             message(STATUS "Found SVE")
+-	        set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
++	        set(ARM_ARGS -fPIC -flax-vector-conversions)
+             add_definitions(-DHAVE_SVE)
+             add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
+         elseif(CPU_HAS_NEON)

 
@@ -0,0 +1,39 @@
+From: Sebastian Ramacher <sramacher@debian.org>
+Date: Sun, 21 Jun 2020 17:54:56 +0200
+Subject: Fix arm* flags
+
+---
+ source/CMakeLists.txt | 7 ++-----
+ 1 file changed, 2 insertions(+), 5 deletions(-)
+
+diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
+index ab5ddfe..eb9b19b 100755
+--- a/source/CMakeLists.txt
++++ b/source/CMakeLists.txt
+@@ -253,10 +253,7 @@ if(GCC)
+     elseif(ARM)
+         find_package(Neon)
+         if(CPU_HAS_NEON)
+-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
+             add_definitions(-DHAVE_NEON)
+-        else()
+-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
+         endif()
+     endif()
+   if(ARM64 OR CROSS_COMPILE_ARM64)
+@@ -265,13 +262,13 @@ if(GCC)
+         find_package(SVE2)
+         if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
+             message(STATUS "Found SVE2")
+-          set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
++          set(ARM_ARGS -fPIC -flax-vector-conversions)
+             add_definitions(-DHAVE_SVE2)
+             add_definitions(-DHAVE_SVE)
+             add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
+         elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
+             message(STATUS "Found SVE")
+-          set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
++          set(ARM_ARGS -fPIC -flax-vector-conversions)
+             add_definitions(-DHAVE_SVE)
+             add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
+         elseif(CPU_HAS_NEON)
​

0004-Do-not-build-with-assembly-support-on-arm.patch Added

 
@@ -0,0 +1,28 @@
+From: Sebastian Ramacher <sramacher@debian.org>
+Date: Fri, 31 May 2024 23:38:23 +0200
+Subject: Do not build with assembly support on arm*
+
+---
+ source/CMakeLists.txt | 9 ---------
+ 1 file changed, 9 deletions(-)
+
+diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
+index 672cc2d..f112330 100755
+--- a/source/CMakeLists.txt
++++ b/source/CMakeLists.txt
+@@ -73,15 +73,6 @@ elseif(POWERMATCH GREATER "-1")
+         add_definitions(-DPPC64=1)
+         message(STATUS "Detected POWER PPC64 target processor")
+     endif()
+-elseif(ARMMATCH GREATER "-1")
+-    if(CROSS_COMPILE_ARM)
+-        message(STATUS "Cross compiling for ARM arch")
+-    else()
+-        set(CROSS_COMPILE_ARM 0)
+-    endif()
+-  message(STATUS "Detected ARM target processor")
+-    set(ARM 1)
+-    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
+ elseif(ARM64MATCH GREATER "-1")
+     #if(CROSS_COMPILE_ARM64)
+         #message(STATUS "Cross compiling for ARM64 arch")
​

arm.patch Deleted

@@ -1,108 +0,0 @@
-Index: x265_3.4/source/CMakeLists.txt
-===================================================================
---- x265_3.4.orig/source/CMakeLists.txt
-+++ x265_3.4/source/CMakeLists.txt
-@@ -64,26 +64,26 @@ elseif(POWERMATCH GREATER "-1")
-         add_definitions(-DPPC64=1)
-         message(STATUS "Detected POWER PPC64 target processor")
-     endif()
--elseif(ARMMATCH GREATER "-1")
--    if(CROSS_COMPILE_ARM)
--        message(STATUS "Cross compiling for ARM arch")
--    else()
--        set(CROSS_COMPILE_ARM 0)
--    endif()
--    set(ARM 1)
--    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
--        message(STATUS "Detected ARM64 target processor")
--        set(ARM64 1)
--        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
--    else()
--        message(STATUS "Detected ARM target processor")
--        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
--    endif()
-+elseif(${SYSPROC} MATCHES "armv5.*")
-+    message(STATUS "Detected ARMV5 system processor")
-+    set(ARMV5 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
-+elseif(${SYSPROC} STREQUAL "armv6l")
-+    message(STATUS "Detected ARMV6 system processor")
-+    set(ARMV6 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
-+elseif(${SYSPROC} STREQUAL "armv7l")
-+    message(STATUS "Detected ARMV7 system processor")
-+    set(ARMV7 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
-+elseif(${SYSPROC} STREQUAL "aarch64")
-+    message(STATUS "Detected AArch64 system processor")
-+    set(ARMV7 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
- else()
-     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
-     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
- endif()
--
- if(UNIX)
-     list(APPEND PLATFORM_LIBS pthread)
-     find_library(LIBRT rt)
-@@ -238,28 +238,9 @@ if(GCC)
-             endif()
-         endif()
-     endif()
--    if(ARM AND CROSS_COMPILE_ARM)
--        if(ARM64)
--            set(ARM_ARGS -fPIC)
--        else()
--            set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
--        endif()
--        message(STATUS "cross compile arm")
--    elseif(ARM)
--        if(ARM64)
--            set(ARM_ARGS -fPIC)
--            add_definitions(-DHAVE_NEON)
--        else()
--            find_package(Neon)
--            if(CPU_HAS_NEON)
--                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
--                add_definitions(-DHAVE_NEON)
--            else()
--                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
--            endif()
--        endif()
-+    if(ARMV7)
-+        add_definitions(-fPIC)
-     endif()
--    add_definitions(${ARM_ARGS})
-     if(FPROFILE_GENERATE)
-         if(INTEL_CXX)
-             add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
-Index: x265_3.4/source/common/cpu.cpp
-===================================================================
---- x265_3.4.orig/source/common/cpu.cpp
-+++ x265_3.4/source/common/cpu.cpp
-@@ -39,7 +39,7 @@
- #include <machine/cpu.h>
- #endif
- 
--#if X265_ARCH_ARM && !defined(HAVE_NEON)
-+#if X265_ARCH_ARM && (!defined(HAVE_NEON) || HAVE_NEON==0)
- #include <signal.h>
- #include <setjmp.h>
- static sigjmp_buf jmpbuf;
-@@ -350,7 +350,6 @@ uint32_t cpu_detect(bool benableavx512)
-     }
- 
-     canjump = 1;
--    PFX(cpu_neon_test)();
-     canjump = 0;
-     signal(SIGILL, oldsig);
- #endif // if !HAVE_NEON
-@@ -366,7 +365,7 @@ uint32_t cpu_detect(bool benableavx512)
-     // which may result in incorrect detection and the counters stuck enabled.
-     // right now Apple does not seem to support performance counters for this test
- #ifndef __MACH__
--    flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
-+    //flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
- #endif
-     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
- #elif X265_ARCH_ARM64

 
@@ -1,108 +0,0 @@
-Index: x265_3.4/source/CMakeLists.txt
-===================================================================
---- x265_3.4.orig/source/CMakeLists.txt
-+++ x265_3.4/source/CMakeLists.txt
-@@ -64,26 +64,26 @@ elseif(POWERMATCH GREATER "-1")
-         add_definitions(-DPPC64=1)
-         message(STATUS "Detected POWER PPC64 target processor")
-     endif()
--elseif(ARMMATCH GREATER "-1")
--    if(CROSS_COMPILE_ARM)
--        message(STATUS "Cross compiling for ARM arch")
--    else()
--        set(CROSS_COMPILE_ARM 0)
--    endif()
--    set(ARM 1)
--    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
--        message(STATUS "Detected ARM64 target processor")
--        set(ARM64 1)
--        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
--    else()
--        message(STATUS "Detected ARM target processor")
--        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
--    endif()
-+elseif(${SYSPROC} MATCHES "armv5.*")
-+    message(STATUS "Detected ARMV5 system processor")
-+    set(ARMV5 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
-+elseif(${SYSPROC} STREQUAL "armv6l")
-+    message(STATUS "Detected ARMV6 system processor")
-+    set(ARMV6 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
-+elseif(${SYSPROC} STREQUAL "armv7l")
-+    message(STATUS "Detected ARMV7 system processor")
-+    set(ARMV7 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
-+elseif(${SYSPROC} STREQUAL "aarch64")
-+    message(STATUS "Detected AArch64 system processor")
-+    set(ARMV7 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
- else()
-     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
-     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
- endif()
--
- if(UNIX)
-     list(APPEND PLATFORM_LIBS pthread)
-     find_library(LIBRT rt)
-@@ -238,28 +238,9 @@ if(GCC)
-             endif()
-         endif()
-     endif()
--    if(ARM AND CROSS_COMPILE_ARM)
--        if(ARM64)
--            set(ARM_ARGS -fPIC)
--        else()
--            set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
--        endif()
--        message(STATUS "cross compile arm")
--    elseif(ARM)
--        if(ARM64)
--            set(ARM_ARGS -fPIC)
--            add_definitions(-DHAVE_NEON)
--        else()
--            find_package(Neon)
--            if(CPU_HAS_NEON)
--                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
--                add_definitions(-DHAVE_NEON)
--            else()
--                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
--            endif()
--        endif()
-+    if(ARMV7)
-+        add_definitions(-fPIC)
-     endif()
--    add_definitions(${ARM_ARGS})
-     if(FPROFILE_GENERATE)
-         if(INTEL_CXX)
-             add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
-Index: x265_3.4/source/common/cpu.cpp
-===================================================================
---- x265_3.4.orig/source/common/cpu.cpp
-+++ x265_3.4/source/common/cpu.cpp
-@@ -39,7 +39,7 @@
- #include <machine/cpu.h>
- #endif
- 
--#if X265_ARCH_ARM && !defined(HAVE_NEON)
-+#if X265_ARCH_ARM && (!defined(HAVE_NEON) || HAVE_NEON==0)
- #include <signal.h>
- #include <setjmp.h>
- static sigjmp_buf jmpbuf;
-@@ -350,7 +350,6 @@ uint32_t cpu_detect(bool benableavx512)
-     }
- 
-     canjump = 1;
--    PFX(cpu_neon_test)();
-     canjump = 0;
-     signal(SIGILL, oldsig);
- #endif // if !HAVE_NEON
-@@ -366,7 +365,7 @@ uint32_t cpu_detect(bool benableavx512)
-     // which may result in incorrect detection and the counters stuck enabled.
-     // right now Apple does not seem to support performance counters for this test
- #ifndef __MACH__
--    flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
-+    //flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
- #endif
-     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
- #elif X265_ARCH_ARM64
​

baselibs.conf Changed

 
@@ -1,1 +1,1 @@
-libx265-199
+libx265-209
​

x265_3.5.tar.gz/source/common/aarch64/ipfilter8.S Deleted

@@ -1,414 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
- *
- * Authors: Yimeng Su <yimeng.su@huawei.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#include "asm.S"
-
-.section .rodata
-
-.align 4
-
-.text
-
-
-
-.macro qpel_filter_0_32b
-    movi            v24.8h, #64
-    uxtl            v19.8h, v5.8b
-    smull           v17.4s, v19.4h, v24.4h
-    smull2          v18.4s, v19.8h, v24.8h
-.endm
-
-.macro qpel_filter_1_32b
-    movi            v16.8h, #58
-    uxtl            v19.8h, v5.8b
-    smull           v17.4s, v19.4h, v16.4h
-    smull2          v18.4s, v19.8h, v16.8h
-
-    movi            v24.8h, #10
-    uxtl            v21.8h, v1.8b
-    smull           v19.4s, v21.4h, v24.4h
-    smull2          v20.4s, v21.8h, v24.8h
-
-    movi            v16.8h, #17
-    uxtl            v23.8h, v2.8b
-    smull           v21.4s, v23.4h, v16.4h
-    smull2          v22.4s, v23.8h, v16.8h
-
-    movi            v24.8h, #5
-    uxtl            v1.8h, v6.8b
-    smull           v23.4s, v1.4h, v24.4h
-    smull2          v16.4s, v1.8h, v24.8h
-
-    sub             v17.4s, v17.4s, v19.4s
-    sub             v18.4s, v18.4s, v20.4s
-
-    uxtl            v1.8h, v4.8b
-    sshll           v19.4s, v1.4h, #2
-    sshll2          v20.4s, v1.8h, #2
-
-    add             v17.4s, v17.4s, v21.4s
-    add             v18.4s, v18.4s, v22.4s
-
-    uxtl            v1.8h, v0.8b
-    uxtl            v2.8h, v3.8b
-    ssubl           v21.4s, v2.4h, v1.4h
-    ssubl2          v22.4s, v2.8h, v1.8h
-
-    add             v17.4s, v17.4s, v19.4s
-    add             v18.4s, v18.4s, v20.4s
-    sub             v21.4s, v21.4s, v23.4s
-    sub             v22.4s, v22.4s, v16.4s
-    add             v17.4s, v17.4s, v21.4s
-    add             v18.4s, v18.4s, v22.4s
-.endm
-
-.macro qpel_filter_2_32b
-    movi            v16.4s, #11
-    uxtl            v19.8h, v5.8b
-    uxtl            v20.8h, v2.8b
-    saddl           v17.4s, v19.4h, v20.4h
-    saddl2          v18.4s, v19.8h, v20.8h
-
-    uxtl            v21.8h, v1.8b
-    uxtl            v22.8h, v6.8b
-    saddl           v19.4s, v21.4h, v22.4h
-    saddl2          v20.4s, v21.8h, v22.8h
-
-    mul             v19.4s, v19.4s, v16.4s
-    mul             v20.4s, v20.4s, v16.4s
-
-    movi            v16.4s, #40
-    mul             v17.4s, v17.4s, v16.4s
-    mul             v18.4s, v18.4s, v16.4s
-
-    uxtl            v21.8h, v4.8b
-    uxtl            v22.8h, v3.8b
-    saddl           v23.4s, v21.4h, v22.4h
-    saddl2          v16.4s, v21.8h, v22.8h
-
-    uxtl            v1.8h, v0.8b
-    uxtl            v2.8h, v7.8b
-    saddl           v21.4s, v1.4h, v2.4h
-    saddl2          v22.4s, v1.8h, v2.8h
-
-    shl             v23.4s, v23.4s, #2
-    shl             v16.4s, v16.4s, #2
-
-    add             v19.4s, v19.4s, v21.4s
-    add             v20.4s, v20.4s, v22.4s
-    add             v17.4s, v17.4s, v23.4s
-    add             v18.4s, v18.4s, v16.4s
-    sub             v17.4s, v17.4s, v19.4s
-    sub             v18.4s, v18.4s, v20.4s
-.endm
-
-.macro qpel_filter_3_32b
-    movi            v16.8h, #17
-    movi            v24.8h, #5
-
-    uxtl            v19.8h, v5.8b
-    smull           v17.4s, v19.4h, v16.4h
-    smull2          v18.4s, v19.8h, v16.8h
-
-    uxtl            v21.8h, v1.8b
-    smull           v19.4s, v21.4h, v24.4h
-    smull2          v20.4s, v21.8h, v24.8h
-
-    movi            v16.8h, #58
-    uxtl            v23.8h, v2.8b
-    smull           v21.4s, v23.4h, v16.4h
-    smull2          v22.4s, v23.8h, v16.8h
-
-    movi            v24.8h, #10
-    uxtl            v1.8h, v6.8b
-    smull           v23.4s, v1.4h, v24.4h
-    smull2          v16.4s, v1.8h, v24.8h
-
-    sub             v17.4s, v17.4s, v19.4s
-    sub             v18.4s, v18.4s, v20.4s
-
-    uxtl            v1.8h, v3.8b
-    sshll           v19.4s, v1.4h, #2
-    sshll2          v20.4s, v1.8h, #2
-
-    add             v17.4s, v17.4s, v21.4s
-    add             v18.4s, v18.4s, v22.4s
-
-    uxtl            v1.8h, v4.8b
-    uxtl            v2.8h, v7.8b
-    ssubl           v21.4s, v1.4h, v2.4h
-    ssubl2          v22.4s, v1.8h, v2.8h
-
-    add             v17.4s, v17.4s, v19.4s
-    add             v18.4s, v18.4s, v20.4s
-    sub             v21.4s, v21.4s, v23.4s
-    sub             v22.4s, v22.4s, v16.4s
-    add             v17.4s, v17.4s, v21.4s
-    add             v18.4s, v18.4s, v22.4s
-.endm
-
-
-
-
-.macro vextin8
-    ld1             {v3.16b}, x11, #16
-    mov             v7.d0, v3.d1
-    ext             v0.8b, v3.8b, v7.8b, #1
-    ext             v4.8b, v3.8b, v7.8b, #2
-    ext             v1.8b, v3.8b, v7.8b, #3
-    ext             v5.8b, v3.8b, v7.8b, #4
-    ext             v2.8b, v3.8b, v7.8b, #5
-    ext             v6.8b, v3.8b, v7.8b, #6
-    ext             v3.8b, v3.8b, v7.8b, #7
-.endm
-
-
-
-// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
-.macro HPS_FILTER a b filterhps
-    mov             w12, #8192
-    mov             w6, w10
-    sub             x3, x3, #\a
-    lsl             x3, x3, #1
-    mov             w9, #\a
-    cmp             w9, #4
-    b.eq            14f
-    cmp             w9, #12
-    b.eq            15f
-    b               7f
-14:

 
@@ -1,414 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
- *
- * Authors: Yimeng Su <yimeng.su@huawei.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#include "asm.S"
-
-.section .rodata
-
-.align 4
-
-.text
-
-
-
-.macro qpel_filter_0_32b
-    movi            v24.8h, #64
-    uxtl            v19.8h, v5.8b
-    smull           v17.4s, v19.4h, v24.4h
-    smull2          v18.4s, v19.8h, v24.8h
-.endm
-
-.macro qpel_filter_1_32b
-    movi            v16.8h, #58
-    uxtl            v19.8h, v5.8b
-    smull           v17.4s, v19.4h, v16.4h
-    smull2          v18.4s, v19.8h, v16.8h
-
-    movi            v24.8h, #10
-    uxtl            v21.8h, v1.8b
-    smull           v19.4s, v21.4h, v24.4h
-    smull2          v20.4s, v21.8h, v24.8h
-
-    movi            v16.8h, #17
-    uxtl            v23.8h, v2.8b
-    smull           v21.4s, v23.4h, v16.4h
-    smull2          v22.4s, v23.8h, v16.8h
-
-    movi            v24.8h, #5
-    uxtl            v1.8h, v6.8b
-    smull           v23.4s, v1.4h, v24.4h
-    smull2          v16.4s, v1.8h, v24.8h
-
-    sub             v17.4s, v17.4s, v19.4s
-    sub             v18.4s, v18.4s, v20.4s
-
-    uxtl            v1.8h, v4.8b
-    sshll           v19.4s, v1.4h, #2
-    sshll2          v20.4s, v1.8h, #2
-
-    add             v17.4s, v17.4s, v21.4s
-    add             v18.4s, v18.4s, v22.4s
-
-    uxtl            v1.8h, v0.8b
-    uxtl            v2.8h, v3.8b
-    ssubl           v21.4s, v2.4h, v1.4h
-    ssubl2          v22.4s, v2.8h, v1.8h
-
-    add             v17.4s, v17.4s, v19.4s
-    add             v18.4s, v18.4s, v20.4s
-    sub             v21.4s, v21.4s, v23.4s
-    sub             v22.4s, v22.4s, v16.4s
-    add             v17.4s, v17.4s, v21.4s
-    add             v18.4s, v18.4s, v22.4s
-.endm
-
-.macro qpel_filter_2_32b
-    movi            v16.4s, #11
-    uxtl            v19.8h, v5.8b
-    uxtl            v20.8h, v2.8b
-    saddl           v17.4s, v19.4h, v20.4h
-    saddl2          v18.4s, v19.8h, v20.8h
-
-    uxtl            v21.8h, v1.8b
-    uxtl            v22.8h, v6.8b
-    saddl           v19.4s, v21.4h, v22.4h
-    saddl2          v20.4s, v21.8h, v22.8h
-
-    mul             v19.4s, v19.4s, v16.4s
-    mul             v20.4s, v20.4s, v16.4s
-
-    movi            v16.4s, #40
-    mul             v17.4s, v17.4s, v16.4s
-    mul             v18.4s, v18.4s, v16.4s
-
-    uxtl            v21.8h, v4.8b
-    uxtl            v22.8h, v3.8b
-    saddl           v23.4s, v21.4h, v22.4h
-    saddl2          v16.4s, v21.8h, v22.8h
-
-    uxtl            v1.8h, v0.8b
-    uxtl            v2.8h, v7.8b
-    saddl           v21.4s, v1.4h, v2.4h
-    saddl2          v22.4s, v1.8h, v2.8h
-
-    shl             v23.4s, v23.4s, #2
-    shl             v16.4s, v16.4s, #2
-
-    add             v19.4s, v19.4s, v21.4s
-    add             v20.4s, v20.4s, v22.4s
-    add             v17.4s, v17.4s, v23.4s
-    add             v18.4s, v18.4s, v16.4s
-    sub             v17.4s, v17.4s, v19.4s
-    sub             v18.4s, v18.4s, v20.4s
-.endm
-
-.macro qpel_filter_3_32b
-    movi            v16.8h, #17
-    movi            v24.8h, #5
-
-    uxtl            v19.8h, v5.8b
-    smull           v17.4s, v19.4h, v16.4h
-    smull2          v18.4s, v19.8h, v16.8h
-
-    uxtl            v21.8h, v1.8b
-    smull           v19.4s, v21.4h, v24.4h
-    smull2          v20.4s, v21.8h, v24.8h
-
-    movi            v16.8h, #58
-    uxtl            v23.8h, v2.8b
-    smull           v21.4s, v23.4h, v16.4h
-    smull2          v22.4s, v23.8h, v16.8h
-
-    movi            v24.8h, #10
-    uxtl            v1.8h, v6.8b
-    smull           v23.4s, v1.4h, v24.4h
-    smull2          v16.4s, v1.8h, v24.8h
-
-    sub             v17.4s, v17.4s, v19.4s
-    sub             v18.4s, v18.4s, v20.4s
-
-    uxtl            v1.8h, v3.8b
-    sshll           v19.4s, v1.4h, #2
-    sshll2          v20.4s, v1.8h, #2
-
-    add             v17.4s, v17.4s, v21.4s
-    add             v18.4s, v18.4s, v22.4s
-
-    uxtl            v1.8h, v4.8b
-    uxtl            v2.8h, v7.8b
-    ssubl           v21.4s, v1.4h, v2.4h
-    ssubl2          v22.4s, v1.8h, v2.8h
-
-    add             v17.4s, v17.4s, v19.4s
-    add             v18.4s, v18.4s, v20.4s
-    sub             v21.4s, v21.4s, v23.4s
-    sub             v22.4s, v22.4s, v16.4s
-    add             v17.4s, v17.4s, v21.4s
-    add             v18.4s, v18.4s, v22.4s
-.endm
-
-
-
-
-.macro vextin8
-    ld1             {v3.16b}, x11, #16
-    mov             v7.d0, v3.d1
-    ext             v0.8b, v3.8b, v7.8b, #1
-    ext             v4.8b, v3.8b, v7.8b, #2
-    ext             v1.8b, v3.8b, v7.8b, #3
-    ext             v5.8b, v3.8b, v7.8b, #4
-    ext             v2.8b, v3.8b, v7.8b, #5
-    ext             v6.8b, v3.8b, v7.8b, #6
-    ext             v3.8b, v3.8b, v7.8b, #7
-.endm
-
-
-
-// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
-.macro HPS_FILTER a b filterhps
-    mov             w12, #8192
-    mov             w6, w10
-    sub             x3, x3, #\a
-    lsl             x3, x3, #1
-    mov             w9, #\a
-    cmp             w9, #4
-    b.eq            14f
-    cmp             w9, #12
-    b.eq            15f
-    b               7f
-14:
​

x265_3.5.tar.gz/source/common/aarch64/ipfilter8.h Deleted

@@ -1,55 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
- *
- * Authors: Yimeng Su <yimeng.su@huawei.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#ifndef X265_IPFILTER8_AARCH64_H
-#define X265_IPFILTER8_AARCH64_H
-
-
-void x265_interp_8tap_horiz_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-
-
-#endif // ifndef X265_IPFILTER8_AARCH64_H

 
@@ -1,55 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
- *
- * Authors: Yimeng Su <yimeng.su@huawei.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#ifndef X265_IPFILTER8_AARCH64_H
-#define X265_IPFILTER8_AARCH64_H
-
-
-void x265_interp_8tap_horiz_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-
-
-#endif // ifndef X265_IPFILTER8_AARCH64_H
​

x265_3.5.tar.gz/source/common/aarch64/pixel-util.h Deleted

@@ -1,40 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
- *
- * Authors: Yimeng Su <yimeng.su@huawei.com>
- *          Hongbin Liu <liuhongbin1@huawei.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#ifndef X265_PIXEL_UTIL_AARCH64_H
-#define X265_PIXEL_UTIL_AARCH64_H
-
-int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-
-uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
-int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
-
-#endif // ifndef X265_PIXEL_UTIL_AARCH64_H

 
@@ -1,40 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
- *
- * Authors: Yimeng Su <yimeng.su@huawei.com>
- *          Hongbin Liu <liuhongbin1@huawei.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#ifndef X265_PIXEL_UTIL_AARCH64_H
-#define X265_PIXEL_UTIL_AARCH64_H
-
-int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-
-uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
-int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
-
-#endif // ifndef X265_PIXEL_UTIL_AARCH64_H
​

x265_3.5.tar.gz/source/common/aarch64/pixel.h Deleted

@@ -1,105 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
- *
- * Authors: Hongbin Liu <liuhongbin1@huawei.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#ifndef X265_I386_PIXEL_AARCH64_H
-#define X265_I386_PIXEL_AARCH64_H
-
-void x265_pixel_avg_pp_4x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_4x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_4x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_8x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_8x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_8x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_8x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_12x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_16x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_16x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_16x12_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_16x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_16x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_16x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_24x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_32x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_32x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_32x24_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_32x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_32x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_48x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_64x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_64x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_64x48_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_64x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-
-void x265_sad_x3_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-
-void x265_sad_x4_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-
-#endif // ifndef X265_I386_PIXEL_AARCH64_H

 
@@ -1,105 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
- *
- * Authors: Hongbin Liu <liuhongbin1@huawei.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#ifndef X265_I386_PIXEL_AARCH64_H
-#define X265_I386_PIXEL_AARCH64_H
-
-void x265_pixel_avg_pp_4x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_4x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_4x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_8x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_8x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_8x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_8x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_12x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_16x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_16x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_16x12_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_16x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_16x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_16x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_24x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_32x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_32x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_32x24_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_32x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_32x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_48x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_64x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_64x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_64x48_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_64x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-
-void x265_sad_x3_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-
-void x265_sad_x4_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-
-#endif // ifndef X265_I386_PIXEL_AARCH64_H
​

x265_3.6.tar.gz/.gitignore Added

 
@@ -0,0 +1,36 @@
+# Prerequisites
+*.d
+
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+*.smod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+# Build directory
+build/
+
​

x265_3.5.tar.gz/build/README.txt -> x265_3.6.tar.gz/build/README.txt Changed

@@ -6,6 +6,9 @@
 
 Note: MSVC12 requires cmake 2.8.11 or later
 
+Note: When the SVE/SVE2 instruction set of the Arm AArch64 architecture is to be used, GCC 10.x or later must
+      be installed in order to compile x265.
+
 
 = Optional Prerequisites =
 
@@ -88,3 +91,25 @@
 building out of a Mercurial source repository.  If you are building out of
 a release source package, the version will not change.  If Mercurial is not
 found, the version will be "unknown".
+
+= Build Instructions for cross-compilation for Arm AArch64 Targets =
+
+When the target platform is based on Arm AArch64 architecture, the x265 can be
+built in x86 platforms. However, the CMAKE_C_COMPILER and CMAKE_CXX_COMPILER
+environment variables should be set to point to the cross compilers of the
+appropriate gcc. For example:
+
+1. export CMAKE_C_COMPILER=aarch64-unknown-linux-gnu-gcc
+2. export CMAKE_CXX_COMPILER=aarch64-unknown-linux-gnu-g++
+
+The default ones are aarch64-linux-gnu-gcc and aarch64-linux-gnu-g++.
+Then, the normal building process can be followed.
+
+Moreover, if the target platform supports SVE or SVE2 instruction set, the
+CROSS_COMPILE_SVE or CROSS_COMPILE_SVE2 environment variables should be set
+to true, respectively. For example:
+
+1. export CROSS_COMPILE_SVE2=true
+2. export CROSS_COMPILE_SVE=true
+
+Then, the normal building process can be followed.

 
@@ -6,6 +6,9 @@
 
 Note: MSVC12 requires cmake 2.8.11 or later
 
+Note: When the SVE/SVE2 instruction set of the Arm AArch64 architecture is to be used, GCC 10.x or later must
+      be installed in order to compile x265.
+
 
 = Optional Prerequisites =
 
@@ -88,3 +91,25 @@
 building out of a Mercurial source repository.  If you are building out of
 a release source package, the version will not change.  If Mercurial is not
 found, the version will be "unknown".
+
+= Build Instructions for cross-compilation for Arm AArch64 Targets =
+
+When the target platform is based on Arm AArch64 architecture, the x265 can be
+built in x86 platforms. However, the CMAKE_C_COMPILER and CMAKE_CXX_COMPILER
+environment variables should be set to point to the cross compilers of the
+appropriate gcc. For example:
+
+1. export CMAKE_C_COMPILER=aarch64-unknown-linux-gnu-gcc
+2. export CMAKE_CXX_COMPILER=aarch64-unknown-linux-gnu-g++
+
+The default ones are aarch64-linux-gnu-gcc and aarch64-linux-gnu-g++.
+Then, the normal building process can be followed.
+
+Moreover, if the target platform supports SVE or SVE2 instruction set, the
+CROSS_COMPILE_SVE or CROSS_COMPILE_SVE2 environment variables should be set
+to true, respectively. For example:
+
+1. export CROSS_COMPILE_SVE2=true
+2. export CROSS_COMPILE_SVE=true
+
+Then, the normal building process can be followed.
​

x265_3.6.tar.gz/build/aarch64-darwin Added

 
+(directory)
​

x265_3.6.tar.gz/build/aarch64-darwin/crosscompile.cmake Added

 
@@ -0,0 +1,23 @@
+# CMake toolchain file for cross compiling x265 for aarch64
+# This feature is only supported as experimental. Use with caution.
+# Please report bugs on bitbucket
+# Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source
+
+set(CROSS_COMPILE_ARM64 1)
+set(CMAKE_SYSTEM_NAME Darwin)
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
+
+# specify the cross compiler
+set(CMAKE_C_COMPILER gcc-12)
+set(CMAKE_CXX_COMPILER g++-12)
+
+# specify the target environment
+SET(CMAKE_FIND_ROOT_PATH  /opt/homebrew/bin/)
+
+# specify whether SVE/SVE2 is supported by the target platform
+if(DEFINED ENV{CROSS_COMPILE_SVE2})
+    set(CROSS_COMPILE_SVE2 1)
+elseif(DEFINED ENV{CROSS_COMPILE_SVE})
+    set(CROSS_COMPILE_SVE 1)
+endif()
+
​

x265_3.6.tar.gz/build/aarch64-darwin/make-Makefiles.bash Added

 
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Run this from within a bash shell
+
+cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source && ccmake ../../source
​

x265_3.5.tar.gz/build/aarch64-linux/crosscompile.cmake -> x265_3.6.tar.gz/build/aarch64-linux/crosscompile.cmake Changed

@@ -3,13 +3,29 @@
 # Please report bugs on bitbucket
 # Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source
 
-set(CROSS_COMPILE_ARM 1)
+set(CROSS_COMPILE_ARM64 1)
 set(CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_SYSTEM_PROCESSOR aarch64)
 
 # specify the cross compiler
-set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
-set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
+if(DEFINED ENV{CMAKE_C_COMPILER})
+    set(CMAKE_C_COMPILER $ENV{CMAKE_C_COMPILER})
+else()
+    set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
+endif()
+if(DEFINED ENV{CMAKE_CXX_COMPILER})
+    set(CMAKE_CXX_COMPILER $ENV{CMAKE_CXX_COMPILER})
+else()
+    set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
+endif()
 
 # specify the target environment
 SET(CMAKE_FIND_ROOT_PATH  /usr/aarch64-linux-gnu)
+
+# specify whether SVE/SVE2 is supported by the target platform
+if(DEFINED ENV{CROSS_COMPILE_SVE2})
+    set(CROSS_COMPILE_SVE2 1)
+elseif(DEFINED ENV{CROSS_COMPILE_SVE})
+    set(CROSS_COMPILE_SVE 1)
+endif()
+

 
@@ -3,13 +3,29 @@
 # Please report bugs on bitbucket
 # Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source
 
-set(CROSS_COMPILE_ARM 1)
+set(CROSS_COMPILE_ARM64 1)
 set(CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_SYSTEM_PROCESSOR aarch64)
 
 # specify the cross compiler
-set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
-set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
+if(DEFINED ENV{CMAKE_C_COMPILER})
+    set(CMAKE_C_COMPILER $ENV{CMAKE_C_COMPILER})
+else()
+    set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
+endif()
+if(DEFINED ENV{CMAKE_CXX_COMPILER})
+    set(CMAKE_CXX_COMPILER $ENV{CMAKE_CXX_COMPILER})
+else()
+    set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
+endif()
 
 # specify the target environment
 SET(CMAKE_FIND_ROOT_PATH  /usr/aarch64-linux-gnu)
+
+# specify whether SVE/SVE2 is supported by the target platform
+if(DEFINED ENV{CROSS_COMPILE_SVE2})
+    set(CROSS_COMPILE_SVE2 1)
+elseif(DEFINED ENV{CROSS_COMPILE_SVE})
+    set(CROSS_COMPILE_SVE 1)
+endif()
+
​

x265_3.5.tar.gz/build/arm-linux/make-Makefiles.bash -> x265_3.6.tar.gz/build/arm-linux/make-Makefiles.bash Changed

 
@@ -1,4 +1,4 @@
 #!/bin/bash
 # Run this from within a bash shell
 
-cmake -G "Unix Makefiles" ../../source && ccmake ../../source
+cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source && ccmake ../../source
​

x265_3.5.tar.gz/doc/reST/cli.rst -> x265_3.6.tar.gz/doc/reST/cli.rst Changed

@@ -632,9 +632,8 @@
 	auto-detection by the encoder. If specified, the encoder will
 	attempt to bring the encode specifications within that specified
 	level. If the encoder is unable to reach the level it issues a
-	warning and aborts the encode. If the requested requirement level is
-	higher than the actual level, the actual requirement level is
-	signaled.
+	warning and aborts the encode. The requested level will be signaled 
+	in the bitstream even if it is higher than the actual level.
 
 	Beware, specifying a decoder level will force the encoder to enable
 	VBV for constant rate factor encodes, which may introduce
@@ -714,11 +713,8 @@
 	(main, main10, etc). Second, an encoder is created from this
 	x265_param instance and the :option:`--level-idc` and
 	:option:`--high-tier` parameters are used to reduce bitrate or other
-	features in order to enforce the target level. Finally, the encoder
-	re-examines the final set of parameters and detects the actual
-	minimum decoder requirement level and this is what is signaled in
-	the bitstream headers. The detected decoder level will only use High
-	tier if the user specified a High tier level.
+	features in order to enforce the target level. The detected decoder level
+	will only use High tier if the user specified a High tier level.
 
 	The signaled profile will be determined by the encoder's internal
 	bitdepth and input color space. If :option:`--keyint` is 0 or 1,
@@ -961,21 +957,21 @@
 	Note that :option:`--analysis-save-reuse-level` and :option:`--analysis-load-reuse-level` must be paired
 	with :option:`--analysis-save` and :option:`--analysis-load` respectively.
 
-	+--------------+------------------------------------------+
-	| Level        | Description                              |
-	+==============+==========================================+
-	| 1            | Lookahead information                    |
-	+--------------+------------------------------------------+
-	| 2 to 4       | Level 1 + intra/inter modes, ref's       |
-	+--------------+------------------------------------------+
-	| 5 and 6      | Level 2 + rect-amp                       |
-	+--------------+------------------------------------------+
-	| 7            | Level 5 + AVC size CU refinement         |
-	+--------------+------------------------------------------+
-	| 8 and 9      | Level 5 + AVC size Full CU analysis-info |
-	+--------------+------------------------------------------+
-	| 10           | Level 5 + Full CU analysis-info          |
-	+--------------+------------------------------------------+
+	+--------------+---------------------------------------------------+
+	| Level        | Description                                       |
+	+==============+===================================================+
+	| 1            | Lookahead information                             |
+	+--------------+---------------------------------------------------+
+	| 2 to 4       | Level 1 + intra/inter modes, depth, ref's, cutree |
+	+--------------+---------------------------------------------------+
+	| 5 and 6      | Level 2 + rect-amp                                |
+	+--------------+---------------------------------------------------+
+	| 7            | Level 5 + AVC size CU refinement                  |
+	+--------------+---------------------------------------------------+
+	| 8 and 9      | Level 5 + AVC size Full CU analysis-info          |
+	+--------------+---------------------------------------------------+
+	| 10           | Level 5 + Full CU analysis-info                   |
+	+--------------+---------------------------------------------------+
 
 .. option:: --refine-mv-type <string>
 
@@ -1332,6 +1328,11 @@
 	Search range for HME level 0, 1 and 2.
 	The Search Range for each HME level must be between 0 and 32768(excluding).
 	Default search range is 16,32,48 for level 0,1,2 respectively.
+	
+.. option:: --mcstf, --no-mcstf
+
+	Enable Motion Compensated Temporal filtering.
+	Default: disabled
 
 Spatial/intra options
 =====================
@@ -1473,17 +1474,9 @@
 
 .. option:: --hist-scenecut, --no-hist-scenecut
 
-	Indicates that scenecuts need to be detected using luma edge and chroma histograms.
-	:option:`--hist-scenecut` enables scenecut detection using the histograms and disables the default scene cut algorithm.
-	:option:`--no-hist-scenecut` disables histogram based scenecut algorithm.
-	
-.. option:: --hist-threshold <0.0..1.0>
-
-	This value represents the threshold for normalized SAD of edge histograms used in scenecut detection.
-	This requires :option:`--hist-scenecut` to be enabled. For example, a value of 0.2 indicates that a frame with normalized SAD value 
-	greater than 0.2 against the previous frame as scenecut. 
-	Increasing the threshold reduces the number of scenecuts detected.
-	Default 0.03.
+	Scenecuts detected based on histogram, intensity and variance of the picture.
+	:option:`--hist-scenecut` enables or :option:`--no-hist-scenecut` disables scenecut detection based on
+	histogram.
 	
 .. option:: --radl <integer>
 	
@@ -1766,6 +1759,12 @@
 	Default 1.0.
 	**Range of values:** 0.0 to 3.0
 
+.. option:: --sbrc, --no-sbrc
+
+	To enable and disable segment based rate control. Segment duration depends on the
+	keyframe interval specified. If unspecified, default keyframe interval will be used.
+	Default: disabled.
+
 .. option:: --hevc-aq
 
 	Enable adaptive quantization
@@ -1976,12 +1975,18 @@
 	
 	**CLI ONLY**
 
+.. option:: --scenecut-qp-config <filename>
+
+	Specify a text file which contains the scenecut aware QP options.
+	The options include :option:`--scenecut-aware-qp` and :option:`--masking-strength`
+
+	**CLI ONLY**
+
 .. option:: --scenecut-aware-qp <integer>
 
 	It reduces the bits spent on the inter-frames within the scenecut window
 	before and after a scenecut by increasing their QP in ratecontrol pass2 algorithm
-	without any deterioration in visual quality. If a scenecut falls within the window,
-	the QP of the inter-frames after this scenecut will not be modified.
+	without any deterioration in visual quality.
 	:option:`--scenecut-aware-qp` works only with --pass 2. Default 0.
 
 	+-------+---------------------------------------------------------------+
@@ -2006,48 +2011,83 @@
 	for the QP increment for inter-frames when :option:`--scenecut-aware-qp`
 	is enabled.
 
-	When :option:`--scenecut-aware-qp` is::
+	When :option:`--scenecut-aware-qp` is:
+
 	* 1 (Forward masking):
-	--masking-strength <fwdWindow,fwdRefQPDelta,fwdNonRefQPDelta>
+	--masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta>
+	or 
+	--masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,
+						fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,
+						fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6>
 	* 2 (Backward masking):
-	--masking-strength <bwdWindow,bwdRefQPDelta,bwdNonRefQPDelta>
+	--masking-strength <bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
+	or 
+	--masking-strength <bwdWindow1,bwdRefQPDelta1,bwdNonRefQPDelta1,bwdWindow2,bwdRefQPDelta2,bwdNonRefQPDelta2,
+						bwdWindow3,bwdRefQPDelta3,bwdNonRefQPDelta3,bwdWindow4,bwdRefQPDelta4,bwdNonRefQPDelta4,
+						bwdWindow5,bwdRefQPDelta5,bwdNonRefQPDelta5,bwdWindow6,bwdRefQPDelta6,bwdNonRefQPDelta6>
 	* 3 (Bi-directional masking):
-	--masking-strength <fwdWindow,fwdRefQPDelta,fwdNonRefQPDelta,bwdWindow,bwdRefQPDelta,bwdNonRefQPDelta>
+	--masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta,bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
+	or 
+	--masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,
+						fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,
+						fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6,
+						bwdWindow1,bwdRefQPDelta1,bwdNonRefQPDelta1,bwdWindow2,bwdRefQPDelta2,bwdNonRefQPDelta2,
+						bwdWindow3,bwdRefQPDelta3,bwdNonRefQPDelta3,bwdWindow4,bwdRefQPDelta4,bwdNonRefQPDelta4,
+						bwdWindow5,bwdRefQPDelta5,bwdNonRefQPDelta5,bwdWindow6,bwdRefQPDelta6,bwdNonRefQPDelta6>
 
 	+-----------------+---------------------------------------------------------------+
 	| Parameter       | Description                                                   |
 	+=================+===============================================================+
-	| fwdWindow       | The duration(in milliseconds) for which there is a reduction  |
-	|                 | in the bits spent on the inter-frames after a scenecut by     |
-	|                 | increasing their QP. Default 500ms.                           |
-	|                 | **Range of values:** 0 to 1000                                |
+	| fwdMaxWindow    | The maximum duration(in milliseconds) for which there is a    |
+	|                 | reduction in the bits spent on the inter-frames after a       |
+	|                 | scenecut by increasing their QP. Default 500ms.               |
+	|                 | **Range of values:** 0 to 2000                                |
+	+-----------------+---------------------------------------------------------------+
+	| fwdWindow       | The duration of a sub-window(in milliseconds) for which there |
+	|                 | is a reduction in the bits spent on the inter-frames after a  |
+	|                 | scenecut by increasing their QP. Default 500ms.               |
+	|                 | **Range of values:** 0 to 2000                                |
 	+-----------------+---------------------------------------------------------------+
 	| fwdRefQPDelta   | The offset by which QP is incremented for inter-frames        |
 	|                 | after a scenecut. Default 5.                                  |
-	|                 | **Range of values:** 0 to 10                                  |
+	|                 | **Range of values:** 0 to 20                                  |
 	+-----------------+---------------------------------------------------------------+
 	| fwdNonRefQPDelta| The offset by which QP is incremented for non-referenced      |
 	|                 | inter-frames after a scenecut. The offset is computed from    |
 	|                 | fwdRefQPDelta when it is not explicitly specified.            |
-	|                 | **Range of values:** 0 to 10                                  |
+	|                 | **Range of values:** 0 to 20                                  |
+	+-----------------+---------------------------------------------------------------+
+	| bwdMaxWindow    | The maximum duration(in milliseconds) for which there is a    |
+	|                 | reduction in the bits spent on the inter-frames before a      |
+	|                 | scenecut by increasing their QP. Default 100ms.               |
+	|                 | **Range of values:** 0 to 2000                                |
 	+-----------------+---------------------------------------------------------------+
-	| bwdWindow       | The duration(in milliseconds) for which there is a reduction  |
-	|                 | in the bits spent on the inter-frames before a scenecut by    |
-	|                 | increasing their QP. Default 100ms.                           |
-	|                 | **Range of values:** 0 to 1000                                |
+	| bwdWindow       | The duration of a sub-window(in milliseconds) for which there |

 
@@ -632,9 +632,8 @@
    auto-detection by the encoder. If specified, the encoder will
    attempt to bring the encode specifications within that specified
    level. If the encoder is unable to reach the level it issues a
-   warning and aborts the encode. If the requested requirement level is
-   higher than the actual level, the actual requirement level is
-   signaled.
+   warning and aborts the encode. The requested level will be signaled 
+   in the bitstream even if it is higher than the actual level.
 
    Beware, specifying a decoder level will force the encoder to enable
    VBV for constant rate factor encodes, which may introduce
@@ -714,11 +713,8 @@
    (main, main10, etc). Second, an encoder is created from this
    x265_param instance and the :option:`--level-idc` and
    :option:`--high-tier` parameters are used to reduce bitrate or other
-   features in order to enforce the target level. Finally, the encoder
-   re-examines the final set of parameters and detects the actual
-   minimum decoder requirement level and this is what is signaled in
-   the bitstream headers. The detected decoder level will only use High
-   tier if the user specified a High tier level.
+   features in order to enforce the target level. The detected decoder level
+   will only use High tier if the user specified a High tier level.
 
    The signaled profile will be determined by the encoder's internal
    bitdepth and input color space. If :option:`--keyint` is 0 or 1,
@@ -961,21 +957,21 @@
    Note that :option:`--analysis-save-reuse-level` and :option:`--analysis-load-reuse-level` must be paired
    with :option:`--analysis-save` and :option:`--analysis-load` respectively.
 
-   +--------------+------------------------------------------+
-   | Level        | Description                              |
-   +==============+==========================================+
-   | 1            | Lookahead information                    |
-   +--------------+------------------------------------------+
-   | 2 to 4       | Level 1 + intra/inter modes, ref's       |
-   +--------------+------------------------------------------+
-   | 5 and 6      | Level 2 + rect-amp                       |
-   +--------------+------------------------------------------+
-   | 7            | Level 5 + AVC size CU refinement         |
-   +--------------+------------------------------------------+
-   | 8 and 9      | Level 5 + AVC size Full CU analysis-info |
-   +--------------+------------------------------------------+
-   | 10           | Level 5 + Full CU analysis-info          |
-   +--------------+------------------------------------------+
+   +--------------+---------------------------------------------------+
+   | Level        | Description                                       |
+   +==============+===================================================+
+   | 1            | Lookahead information                             |
+   +--------------+---------------------------------------------------+
+   | 2 to 4       | Level 1 + intra/inter modes, depth, ref's, cutree |
+   +--------------+---------------------------------------------------+
+   | 5 and 6      | Level 2 + rect-amp                                |
+   +--------------+---------------------------------------------------+
+   | 7            | Level 5 + AVC size CU refinement                  |
+   +--------------+---------------------------------------------------+
+   | 8 and 9      | Level 5 + AVC size Full CU analysis-info          |
+   +--------------+---------------------------------------------------+
+   | 10           | Level 5 + Full CU analysis-info                   |
+   +--------------+---------------------------------------------------+
 
 .. option:: --refine-mv-type <string>
 
@@ -1332,6 +1328,11 @@
    Search range for HME level 0, 1 and 2.
    The Search Range for each HME level must be between 0 and 32768(excluding).
    Default search range is 16,32,48 for level 0,1,2 respectively.
+   
+.. option:: --mcstf, --no-mcstf
+
+   Enable Motion-Compensated Spatio-Temporal Filtering (MCSTF).
+   Default: disabled
 
 Spatial/intra options
 =====================
@@ -1473,17 +1474,9 @@
 
 .. option:: --hist-scenecut, --no-hist-scenecut
 
-   Indicates that scenecuts need to be detected using luma edge and chroma histograms.
-   :option:`--hist-scenecut` enables scenecut detection using the histograms and disables the default scene cut algorithm.
-   :option:`--no-hist-scenecut` disables histogram based scenecut algorithm.
-   
-.. option:: --hist-threshold <0.0..1.0>
-
-   This value represents the threshold for normalized SAD of edge histograms used in scenecut detection.
-   This requires :option:`--hist-scenecut` to be enabled. For example, a value of 0.2 indicates that a frame with normalized SAD value 
-   greater than 0.2 against the previous frame as scenecut. 
-   Increasing the threshold reduces the number of scenecuts detected.
-   Default 0.03.
+   Detect scenecuts based on the histogram, intensity and variance of the picture.
+   :option:`--hist-scenecut` enables and :option:`--no-hist-scenecut` disables
+   histogram-based scenecut detection.
    
 .. option:: --radl <integer>
    
@@ -1766,6 +1759,12 @@
    Default 1.0.
    **Range of values:** 0.0 to 3.0
 
+.. option:: --sbrc, --no-sbrc
+
+   Enable or disable segment based rate control. Segment duration depends on the
+   keyframe interval specified. If unspecified, the default keyframe interval will be used.
+   Default: disabled.
+
 .. option:: --hevc-aq
 
    Enable adaptive quantization
@@ -1976,12 +1975,18 @@
    
    **CLI ONLY**
 
+.. option:: --scenecut-qp-config <filename>
+
+   Specify a text file which contains the scenecut aware QP options.
+   The options include :option:`--scenecut-aware-qp` and :option:`--masking-strength`.
+
+   **CLI ONLY**
+
 .. option:: --scenecut-aware-qp <integer>
 
    It reduces the bits spent on the inter-frames within the scenecut window
    before and after a scenecut by increasing their QP in ratecontrol pass2 algorithm
-   without any deterioration in visual quality. If a scenecut falls within the window,
-   the QP of the inter-frames after this scenecut will not be modified.
+   without any deterioration in visual quality.
    :option:`--scenecut-aware-qp` works only with --pass 2. Default 0.
 
    +-------+---------------------------------------------------------------+
@@ -2006,48 +2011,83 @@
    for the QP increment for inter-frames when :option:`--scenecut-aware-qp`
    is enabled.
 
-   When :option:`--scenecut-aware-qp` is::
+   When :option:`--scenecut-aware-qp` is:
+
    * 1 (Forward masking):
-   --masking-strength <fwdWindow,fwdRefQPDelta,fwdNonRefQPDelta>
+   --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta>
+   or 
+   --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,
+                       fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,
+                       fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6>
    * 2 (Backward masking):
-   --masking-strength <bwdWindow,bwdRefQPDelta,bwdNonRefQPDelta>
+   --masking-strength <bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
+   or 
+   --masking-strength <bwdWindow1,bwdRefQPDelta1,bwdNonRefQPDelta1,bwdWindow2,bwdRefQPDelta2,bwdNonRefQPDelta2,
+                       bwdWindow3,bwdRefQPDelta3,bwdNonRefQPDelta3,bwdWindow4,bwdRefQPDelta4,bwdNonRefQPDelta4,
+                       bwdWindow5,bwdRefQPDelta5,bwdNonRefQPDelta5,bwdWindow6,bwdRefQPDelta6,bwdNonRefQPDelta6>
    * 3 (Bi-directional masking):
-   --masking-strength <fwdWindow,fwdRefQPDelta,fwdNonRefQPDelta,bwdWindow,bwdRefQPDelta,bwdNonRefQPDelta>
+   --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta,bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
+   or 
+   --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,
+                       fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,
+                       fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6,
+                       bwdWindow1,bwdRefQPDelta1,bwdNonRefQPDelta1,bwdWindow2,bwdRefQPDelta2,bwdNonRefQPDelta2,
+                       bwdWindow3,bwdRefQPDelta3,bwdNonRefQPDelta3,bwdWindow4,bwdRefQPDelta4,bwdNonRefQPDelta4,
+                       bwdWindow5,bwdRefQPDelta5,bwdNonRefQPDelta5,bwdWindow6,bwdRefQPDelta6,bwdNonRefQPDelta6>
 
    +-----------------+---------------------------------------------------------------+
    | Parameter       | Description                                                   |
    +=================+===============================================================+
-   | fwdWindow       | The duration(in milliseconds) for which there is a reduction  |
-   |                 | in the bits spent on the inter-frames after a scenecut by     |
-   |                 | increasing their QP. Default 500ms.                           |
-   |                 | **Range of values:** 0 to 1000                                |
+   | fwdMaxWindow    | The maximum duration(in milliseconds) for which there is a    |
+   |                 | reduction in the bits spent on the inter-frames after a       |
+   |                 | scenecut by increasing their QP. Default 500ms.               |
+   |                 | **Range of values:** 0 to 2000                                |
+   +-----------------+---------------------------------------------------------------+
+   | fwdWindow       | The duration of a sub-window(in milliseconds) for which there |
+   |                 | is a reduction in the bits spent on the inter-frames after a  |
+   |                 | scenecut by increasing their QP. Default 500ms.               |
+   |                 | **Range of values:** 0 to 2000                                |
    +-----------------+---------------------------------------------------------------+
    | fwdRefQPDelta   | The offset by which QP is incremented for inter-frames        |
    |                 | after a scenecut. Default 5.                                  |
-   |                 | **Range of values:** 0 to 10                                  |
+   |                 | **Range of values:** 0 to 20                                  |
    +-----------------+---------------------------------------------------------------+
    | fwdNonRefQPDelta| The offset by which QP is incremented for non-referenced      |
    |                 | inter-frames after a scenecut. The offset is computed from    |
    |                 | fwdRefQPDelta when it is not explicitly specified.            |
-   |                 | **Range of values:** 0 to 10                                  |
+   |                 | **Range of values:** 0 to 20                                  |
+   +-----------------+---------------------------------------------------------------+
+   | bwdMaxWindow    | The maximum duration(in milliseconds) for which there is a    |
+   |                 | reduction in the bits spent on the inter-frames before a      |
+   |                 | scenecut by increasing their QP. Default 100ms.               |
+   |                 | **Range of values:** 0 to 2000                                |
    +-----------------+---------------------------------------------------------------+
-   | bwdWindow       | The duration(in milliseconds) for which there is a reduction  |
-   |                 | in the bits spent on the inter-frames before a scenecut by    |
-   |                 | increasing their QP. Default 100ms.                           |
-   |                 | **Range of values:** 0 to 1000                                |
+   | bwdWindow       | The duration of a sub-window(in milliseconds) for which there |
​

x265_3.5.tar.gz/doc/reST/introduction.rst -> x265_3.6.tar.gz/doc/reST/introduction.rst Changed

 
@@ -77,6 +77,6 @@
 to start is with the `Motion Picture Experts Group - Licensing Authority
 - HEVC Licensing Program <http://www.mpegla.com/main/PID/HEVC/default.aspx>`_.
 
-x265 is a registered trademark of MulticoreWare, Inc.  The x265 logo is
+x265 is a registered trademark of MulticoreWare, Inc.  The X265 logo is
 a trademark of MulticoreWare, and may only be used with explicit written
 permission.  All rights reserved.
​

x265_3.5.tar.gz/doc/reST/releasenotes.rst -> x265_3.6.tar.gz/doc/reST/releasenotes.rst Changed

@@ -2,6 +2,53 @@
 Release Notes
 *************
 
+Version 3.6
+===========
+
+Release date - 4th April, 2024.
+
+New feature
+-----------
+1. Segment based Ratecontrol (SBRC) feature
+2. Motion-Compensated Spatio-Temporal Filtering
+3. Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware Quantization)
+4. Histogram-Based Scene Change Detection
+5. Film-Grain characteristics as a SEI message to support Film Grain Synthesis(FGS)
+6. Add temporal layer implementation(Hierarchical B-frame implementation)
+ 
+Enhancements to existing features
+---------------------------------
+1. Added Dolby Vision 8.4 Profile Support
+
+
+API changes
+-----------
+1. Add Segment based Ratecontrol(SBRC) feature: "--no-sbrc".
+2. Add command line parameter for mcstf feature: "--no-mcstf".
+3. Add command line parameters for the scene cut aware qp feature: "--scenecut-aware-qp" and "--masking-strength".
+4. Add command line parameters for Histogram-Based Scene Change Detection: "--hist-scenecut".
+5. Add film grain characteristics as a SEI message to the bitstream: "--film-grain <filename>"
+6. cli: add new option --cra-nal (Force nal type to CRA to all frames except for the first frame, works only with keyint 1)
+
+Optimizations
+---------------------
+ARM64 NEON optimizations:- Several time-consuming C functions have been optimized for the targeted platform - aarch64. The overall performance increased by around 20%.
+SVE/SVE2 optimizations
+
+
+Bug fixes
+---------
+1. Linux bug to utilize all the cores
+2. Crash with hist-scenecut build when source resolution is not multiple of minCuSize
+3. 32bit and 64bit builds generation for ARM
+4. bugs in zonefile feature (Reflect Zonefile Parameters inside Lookahead, extra IDR issue, Avg I Slice QP value issue etc..)
+5. Add x86 ASM implementation for subsampling luma 
+6. Fix for abrladder segfault with load reuse level 1 
+7. Reorder miniGOP based on temporal layer hierarchy and add support for more B frame 
+8. Add MacOS aarch64 build support 
+9. Fix boundary condition issue for Gaussian filter
+
+
 Version 3.5
 ===========

 
@@ -2,6 +2,53 @@
 Release Notes
 *************
 
+Version 3.6
+===========
+
+Release date - 4th April, 2024.
+
+New feature
+-----------
+1. Segment based Ratecontrol (SBRC) feature
+2. Motion-Compensated Spatio-Temporal Filtering
+3. Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware Quantization)
+4. Histogram-Based Scene Change Detection
+5. Film-Grain characteristics as a SEI message to support Film Grain Synthesis(FGS)
+6. Add temporal layer implementation(Hierarchical B-frame implementation)
+ 
+Enhancements to existing features
+---------------------------------
+1. Added Dolby Vision 8.4 Profile Support
+
+
+API changes
+-----------
+1. Add Segment based Ratecontrol(SBRC) feature: "--no-sbrc".
+2. Add command line parameter for mcstf feature: "--no-mcstf".
+3. Add command line parameters for the scene cut aware qp feature: "--scenecut-aware-qp" and "--masking-strength".
+4. Add command line parameters for Histogram-Based Scene Change Detection: "--hist-scenecut".
+5. Add film grain characteristics as a SEI message to the bitstream: "--film-grain <filename>"
+6. cli: add new option --cra-nal (Force nal type to CRA to all frames except for the first frame, works only with keyint 1)
+
+Optimizations
+---------------------
+ARM64 NEON optimizations:- Several time-consuming C functions have been optimized for the targeted platform - aarch64. The overall performance increased by around 20%.
+SVE/SVE2 optimizations
+
+
+Bug fixes
+---------
+1. Linux bug to utilize all the cores
+2. Crash with hist-scenecut build when source resolution is not multiple of minCuSize
+3. 32bit and 64bit builds generation for ARM
+4. bugs in zonefile feature (Reflect Zonefile Parameters inside Lookahead, extra IDR issue, Avg I Slice QP value issue etc..)
+5. Add x86 ASM implementation for subsampling luma 
+6. Fix for abrladder segfault with load reuse level 1 
+7. Reorder miniGOP based on temporal layer hierarchy and add support for more B frame 
+8. Add MacOS aarch64 build support 
+9. Fix boundary condition issue for Gaussian filter
+
+
 Version 3.5
 ===========
 
​

x265_3.5.tar.gz/readme.rst -> x265_3.6.tar.gz/readme.rst Changed

 
@@ -2,7 +2,7 @@
 x265 HEVC Encoder
 =================
 
-| **Read:** | Online `documentation <http://x265.readthedocs.org/en/default/>`_ | Developer `wiki <http://bitbucket.org/multicoreware/x265/wiki/>`_
+| **Read:** | Online `documentation <http://x265.readthedocs.org/en/master/>`_ | Developer `wiki <http://bitbucket.org/multicoreware/x265_git/wiki/>`_
 | **Download:** | `releases <http://ftp.videolan.org/pub/videolan/x265/>`_ 
 | **Interact:** | #x265 on freenode.irc.net | `x265-devel@videolan.org <http://mailman.videolan.org/listinfo/x265-devel>`_ | `Report an issue <https://bitbucket.org/multicoreware/x265/issues?status=new&status=open>`_
 
​

x265_3.5.tar.gz/source/CMakeLists.txt -> x265_3.6.tar.gz/source/CMakeLists.txt Changed

@@ -29,7 +29,7 @@
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 199)
+set(X265_BUILD 209)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -38,14 +38,20 @@
 SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}")
 
 # System architecture detection
-string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
+if (APPLE AND CMAKE_OSX_ARCHITECTURES)
+    string(TOLOWER "${CMAKE_OSX_ARCHITECTURES}" SYSPROC)
+else()
+    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
+endif()
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
-set(ARM_ALIASES armv6l armv7l aarch64)
+set(ARM_ALIASES armv6l armv7l)
+set(ARM64_ALIASES arm64 arm64e aarch64)
 list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
 list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)
-set(POWER_ALIASES ppc64 ppc64le)
+list(FIND ARM64_ALIASES "${SYSPROC}" ARM64MATCH)
+set(POWER_ALIASES powerpc64 powerpc64le ppc64 ppc64le)
 list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
-if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
+if(X86MATCH GREATER "-1")
     set(X86 1)
     add_definitions(-DX265_ARCH_X86=1)
     if(CMAKE_CXX_FLAGS STREQUAL "-m32")
@@ -70,15 +76,18 @@
     else()
         set(CROSS_COMPILE_ARM 0)
     endif()
+	message(STATUS "Detected ARM target processor")
     set(ARM 1)
-    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
-        message(STATUS "Detected ARM64 target processor")
-        set(ARM64 1)
-        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
-    else()
-        message(STATUS "Detected ARM target processor")
-        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
-    endif()
+    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
+elseif(ARM64MATCH GREATER "-1")
+    #if(CROSS_COMPILE_ARM64)
+        #message(STATUS "Cross compiling for ARM64 arch")
+    #else()
+        #set(CROSS_COMPILE_ARM64 0)
+    #endif()
+    message(STATUS "Detected ARM64 target processor")
+    set(ARM64 1)
+    add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON)
 else()
     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
@@ -239,26 +248,43 @@
         endif()
     endif()
     if(ARM AND CROSS_COMPILE_ARM)
-        if(ARM64)
-            set(ARM_ARGS -fPIC)
-        else()
-            set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
-        endif()
         message(STATUS "cross compile arm")
+		set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
     elseif(ARM)
-        if(ARM64)
-            set(ARM_ARGS -fPIC)
+        find_package(Neon)
+        if(CPU_HAS_NEON)
+            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
             add_definitions(-DHAVE_NEON)
         else()
-            find_package(Neon)
-            if(CPU_HAS_NEON)
-                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
-                add_definitions(-DHAVE_NEON)
-            else()
-                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
-            endif()
+            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
         endif()
     endif()
+	if(ARM64 OR CROSS_COMPILE_ARM64)
+        find_package(Neon)
+        find_package(SVE)
+        find_package(SVE2)
+        if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
+            message(STATUS "Found SVE2")
+	        set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
+            add_definitions(-DHAVE_SVE2)
+            add_definitions(-DHAVE_SVE)
+            add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
+        elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
+            message(STATUS "Found SVE")
+	        set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
+            add_definitions(-DHAVE_SVE)
+            add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
+        elseif(CPU_HAS_NEON)
+            message(STATUS "Found NEON")
+            set(ARM_ARGS -fPIC -flax-vector-conversions)
+            add_definitions(-DHAVE_NEON)
+        else()
+            set(ARM_ARGS -fPIC -flax-vector-conversions)
+        endif()        
+    endif()
+	if(ENABLE_PIC)
+	list(APPEND ARM_ARGS -DPIC)
+	endif()
     add_definitions(${ARM_ARGS})
     if(FPROFILE_GENERATE)
         if(INTEL_CXX)
@@ -350,7 +376,7 @@
 endif(GCC)
 
 find_package(Nasm)
-if(ARM OR CROSS_COMPILE_ARM)
+if(ARM OR CROSS_COMPILE_ARM OR ARM64 OR CROSS_COMPILE_ARM64)
     option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" ON)
 elseif(NASM_FOUND AND X86)
     if (NASM_VERSION_STRING VERSION_LESS "2.13.0")
@@ -384,7 +410,7 @@
 endif(EXTRA_LIB)
 mark_as_advanced(EXTRA_LIB EXTRA_LINK_FLAGS)
 
-if(X64)
+if(X64 OR ARM64 OR PPC64)
     # NOTE: We only officially support high-bit-depth compiles of x265
     # on 64bit architectures. Main10 plus large resolution plus slow
     # preset plus 32bit address space usually means malloc failure.  You
@@ -393,7 +419,7 @@
     # license" so to speak.  If it breaks you get to keep both halves.
     # You will need to disable assembly manually.
     option(HIGH_BIT_DEPTH "Store pixel samples as 16bit values (Main10/Main12)" OFF)
-endif(X64)
+endif(X64 OR ARM64 OR PPC64)
 if(HIGH_BIT_DEPTH)
     option(MAIN12 "Support Main12 instead of Main10" OFF)
     if(MAIN12)
@@ -440,6 +466,18 @@
 endif()
 add_definitions(-DX265_NS=${X265_NS})
 
+if(ARM64)
+  if(HIGH_BIT_DEPTH)
+    if(MAIN12)
+      list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=12 -DX265_NS=${X265_NS})
+    else()
+      list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10 -DX265_NS=${X265_NS})
+    endif()
+  else()
+    list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8 -DX265_NS=${X265_NS})
+  endif()
+endif(ARM64)
+
 option(WARNINGS_AS_ERRORS "Stop compiles on first warning" OFF)
 if(WARNINGS_AS_ERRORS)
     if(GCC)
@@ -536,11 +574,7 @@
     # compile ARM arch asm files here
         enable_language(ASM)
         foreach(ASM ${ARM_ASMS})
-            if(ARM64)
-                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
-            else()
-                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
-            endif()
+			set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
             list(APPEND ASM_SRCS ${ASM_SRC})
             list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
             add_custom_command(
@@ -549,6 +583,52 @@
                 ARGS ${ARM_ARGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
                 DEPENDS ${ASM_SRC})
         endforeach()
+	elseif(ARM64 OR CROSS_COMPILE_ARM64)
+    # compile ARM64 arch asm files here
+        enable_language(ASM)
+        foreach(ASM ${ARM_ASMS})
+            set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
+            list(APPEND ASM_SRCS ${ASM_SRC})
+            list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
+            add_custom_command(
+                OUTPUT ${ASM}.${SUFFIX}
+                COMMAND ${CMAKE_CXX_COMPILER}
+                ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
+                DEPENDS ${ASM_SRC})
+        endforeach()
+        if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
+            foreach(ASM ${ARM_ASMS_SVE})
+                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
+                list(APPEND ASM_SRCS ${ASM_SRC})
+                list(APPEND ASM_OBJS ${ASM}.${SUFFIX})

 
@@ -29,7 +29,7 @@
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 199)
+set(X265_BUILD 209)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -38,14 +38,20 @@
 SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}")
 
 # System architecture detection
-string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
+if (APPLE AND CMAKE_OSX_ARCHITECTURES)
+    string(TOLOWER "${CMAKE_OSX_ARCHITECTURES}" SYSPROC)
+else()
+    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
+endif()
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
-set(ARM_ALIASES armv6l armv7l aarch64)
+set(ARM_ALIASES armv6l armv7l)
+set(ARM64_ALIASES arm64 arm64e aarch64)
 list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
 list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)
-set(POWER_ALIASES ppc64 ppc64le)
+list(FIND ARM64_ALIASES "${SYSPROC}" ARM64MATCH)
+set(POWER_ALIASES powerpc64 powerpc64le ppc64 ppc64le)
 list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
-if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
+if(X86MATCH GREATER "-1")
     set(X86 1)
     add_definitions(-DX265_ARCH_X86=1)
     if(CMAKE_CXX_FLAGS STREQUAL "-m32")
@@ -70,15 +76,18 @@
     else()
         set(CROSS_COMPILE_ARM 0)
     endif()
+   message(STATUS "Detected ARM target processor")
     set(ARM 1)
-    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
-        message(STATUS "Detected ARM64 target processor")
-        set(ARM64 1)
-        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
-    else()
-        message(STATUS "Detected ARM target processor")
-        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
-    endif()
+    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
+elseif(ARM64MATCH GREATER "-1")
+    #if(CROSS_COMPILE_ARM64)
+        #message(STATUS "Cross compiling for ARM64 arch")
+    #else()
+        #set(CROSS_COMPILE_ARM64 0)
+    #endif()
+    message(STATUS "Detected ARM64 target processor")
+    set(ARM64 1)
+    add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON)
 else()
     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
@@ -239,26 +248,43 @@
         endif()
     endif()
     if(ARM AND CROSS_COMPILE_ARM)
-        if(ARM64)
-            set(ARM_ARGS -fPIC)
-        else()
-            set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
-        endif()
         message(STATUS "cross compile arm")
+       set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
     elseif(ARM)
-        if(ARM64)
-            set(ARM_ARGS -fPIC)
+        find_package(Neon)
+        if(CPU_HAS_NEON)
+            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
             add_definitions(-DHAVE_NEON)
         else()
-            find_package(Neon)
-            if(CPU_HAS_NEON)
-                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
-                add_definitions(-DHAVE_NEON)
-            else()
-                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
-            endif()
+            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
         endif()
     endif()
+   if(ARM64 OR CROSS_COMPILE_ARM64)
+        find_package(Neon)
+        find_package(SVE)
+        find_package(SVE2)
+        if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
+            message(STATUS "Found SVE2")
+           set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
+            add_definitions(-DHAVE_SVE2)
+            add_definitions(-DHAVE_SVE)
+            add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
+        elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
+            message(STATUS "Found SVE")
+           set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
+            add_definitions(-DHAVE_SVE)
+            add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
+        elseif(CPU_HAS_NEON)
+            message(STATUS "Found NEON")
+            set(ARM_ARGS -fPIC -flax-vector-conversions)
+            add_definitions(-DHAVE_NEON)
+        else()
+            set(ARM_ARGS -fPIC -flax-vector-conversions)
+        endif()        
+    endif()
+   if(ENABLE_PIC)
+   list(APPEND ARM_ARGS -DPIC)
+   endif()
     add_definitions(${ARM_ARGS})
     if(FPROFILE_GENERATE)
         if(INTEL_CXX)
@@ -350,7 +376,7 @@
 endif(GCC)
 
 find_package(Nasm)
-if(ARM OR CROSS_COMPILE_ARM)
+if(ARM OR CROSS_COMPILE_ARM OR ARM64 OR CROSS_COMPILE_ARM64)
     option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" ON)
 elseif(NASM_FOUND AND X86)
     if (NASM_VERSION_STRING VERSION_LESS "2.13.0")
@@ -384,7 +410,7 @@
 endif(EXTRA_LIB)
 mark_as_advanced(EXTRA_LIB EXTRA_LINK_FLAGS)
 
-if(X64)
+if(X64 OR ARM64 OR PPC64)
     # NOTE: We only officially support high-bit-depth compiles of x265
     # on 64bit architectures. Main10 plus large resolution plus slow
     # preset plus 32bit address space usually means malloc failure.  You
@@ -393,7 +419,7 @@
     # license" so to speak.  If it breaks you get to keep both halves.
     # You will need to disable assembly manually.
     option(HIGH_BIT_DEPTH "Store pixel samples as 16bit values (Main10/Main12)" OFF)
-endif(X64)
+endif(X64 OR ARM64 OR PPC64)
 if(HIGH_BIT_DEPTH)
     option(MAIN12 "Support Main12 instead of Main10" OFF)
     if(MAIN12)
@@ -440,6 +466,18 @@
 endif()
 add_definitions(-DX265_NS=${X265_NS})
 
+if(ARM64)
+  if(HIGH_BIT_DEPTH)
+    if(MAIN12)
+      list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=12 -DX265_NS=${X265_NS})
+    else()
+      list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10 -DX265_NS=${X265_NS})
+    endif()
+  else()
+    list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8 -DX265_NS=${X265_NS})
+  endif()
+endif(ARM64)
+
 option(WARNINGS_AS_ERRORS "Stop compiles on first warning" OFF)
 if(WARNINGS_AS_ERRORS)
     if(GCC)
@@ -536,11 +574,7 @@
     # compile ARM arch asm files here
         enable_language(ASM)
         foreach(ASM ${ARM_ASMS})
-            if(ARM64)
-                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
-            else()
-                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
-            endif()
+           set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
             list(APPEND ASM_SRCS ${ASM_SRC})
             list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
             add_custom_command(
@@ -549,6 +583,52 @@
                 ARGS ${ARM_ARGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
                 DEPENDS ${ASM_SRC})
         endforeach()
+   elseif(ARM64 OR CROSS_COMPILE_ARM64)
+    # compile ARM64 arch asm files here
+        enable_language(ASM)
+        foreach(ASM ${ARM_ASMS})
+            set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
+            list(APPEND ASM_SRCS ${ASM_SRC})
+            list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
+            add_custom_command(
+                OUTPUT ${ASM}.${SUFFIX}
+                COMMAND ${CMAKE_CXX_COMPILER}
+                ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
+                DEPENDS ${ASM_SRC})
+        endforeach()
+        if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
+            foreach(ASM ${ARM_ASMS_SVE})
+                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
+                list(APPEND ASM_SRCS ${ASM_SRC})
+                list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
​

x265_3.5.tar.gz/source/abrEncApp.cpp -> x265_3.6.tar.gz/source/abrEncApp.cpp Changed

@@ -1,1111 +1,1111 @@
-/*****************************************************************************
-* Copyright (C) 2013-2020 MulticoreWare, Inc
-*
-* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
-*          Aruna Matheswaran <aruna@multicorewareinc.com>
-*
-* This program is free software; you can redistribute it and/or modify
-* it under the terms of the GNU General Public License as published by
-* the Free Software Foundation; either version 2 of the License, or
-* (at your option) any later version.
-*
-* This program is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-* GNU General Public License for more details.
-*
-* You should have received a copy of the GNU General Public License
-* along with this program; if not, write to the Free Software
-* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-*
-* This program is also available under a commercial proprietary license.
-* For more information, contact us at license @ x265.com.
-*****************************************************************************/
-
-#include "abrEncApp.h"
-#include "mv.h"
-#include "slice.h"
-#include "param.h"
-
-#include <signal.h>
-#include <errno.h>
-
-#include <queue>
-
-using namespace X265_NS;
-
-/* Ctrl-C handler */
-static volatile sig_atomic_t b_ctrl_c /* = 0 */;
-static void sigint_handler(int)
-{
-    b_ctrl_c = 1;
-}
-
-namespace X265_NS {
-    // private namespace
-#define X265_INPUT_QUEUE_SIZE 250
-
-    AbrEncoder::AbrEncoder(CLIOptions cliopt[], uint8_t numEncodes, int &ret)
-    {
-        m_numEncodes = numEncodes;
-        m_numActiveEncodes.set(numEncodes);
-        m_queueSize = (numEncodes > 1) ? X265_INPUT_QUEUE_SIZE : 1;
-        m_passEnc = X265_MALLOC(PassEncoder*, m_numEncodes);
-
-        for (uint8_t i = 0; i < m_numEncodes; i++)
-        {
-            m_passEnc[i] = new PassEncoder(i, cliopt[i], this);
-            if (!m_passEnc[i])
-            {
-                x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for passEncoder\n");
-                ret = 4;
-            }
-            m_passEnc[i]->init(ret);
-        }
-
-        if (!allocBuffers())
-        {
-            x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for buffers\n");
-            ret = 4;
-        }
-
-        /* start passEncoder worker threads */
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
-            m_passEnc[pass]->startThreads();
-    }
-
-    bool AbrEncoder::allocBuffers()
-    {
-        m_inputPicBuffer = X265_MALLOC(x265_picture**, m_numEncodes);
-        m_analysisBuffer = X265_MALLOC(x265_analysis_data*, m_numEncodes);
-
-        m_picWriteCnt = new ThreadSafeInteger[m_numEncodes];
-        m_picReadCnt = new ThreadSafeInteger[m_numEncodes];
-        m_analysisWriteCnt = new ThreadSafeInteger[m_numEncodes];
-        m_analysisReadCnt = new ThreadSafeInteger[m_numEncodes];
-
-        m_picIdxReadCnt = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
-        m_analysisWrite = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
-        m_analysisRead = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
-        m_readFlag = X265_MALLOC(int*, m_numEncodes);
-
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
-        {
-            m_inputPicBuffer[pass] = X265_MALLOC(x265_picture*, m_queueSize);
-            for (uint32_t idx = 0; idx < m_queueSize; idx++)
-            {
-                m_inputPicBuffer[pass][idx] = x265_picture_alloc();
-                x265_picture_init(m_passEnc[pass]->m_param, m_inputPicBuffer[pass][idx]);
-            }
-
-            CHECKED_MALLOC_ZERO(m_analysisBuffer[pass], x265_analysis_data, m_queueSize);
-            m_picIdxReadCnt[pass] = new ThreadSafeInteger[m_queueSize];
-            m_analysisWrite[pass] = new ThreadSafeInteger[m_queueSize];
-            m_analysisRead[pass] = new ThreadSafeInteger[m_queueSize];
-            m_readFlag[pass] = X265_MALLOC(int, m_queueSize);
-        }
-        return true;
-    fail:
-        return false;
-    }
-
-    void AbrEncoder::destroy()
-    {
-        x265_cleanup(); /* Free library singletons */
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
-        {
-            for (uint32_t index = 0; index < m_queueSize; index++)
-            {
-                X265_FREE(m_inputPicBuffer[pass][index]->planes[0]);
-                x265_picture_free(m_inputPicBuffer[pass][index]);
-            }
-
-            X265_FREE(m_inputPicBuffer[pass]);
-            X265_FREE(m_analysisBuffer[pass]);
-            X265_FREE(m_readFlag[pass]);
-            delete[] m_picIdxReadCnt[pass];
-            delete[] m_analysisWrite[pass];
-            delete[] m_analysisRead[pass];
-            m_passEnc[pass]->destroy();
-            delete m_passEnc[pass];
-        }
-        X265_FREE(m_inputPicBuffer);
-        X265_FREE(m_analysisBuffer);
-        X265_FREE(m_readFlag);
-
-        delete[] m_picWriteCnt;
-        delete[] m_picReadCnt;
-        delete[] m_analysisWriteCnt;
-        delete[] m_analysisReadCnt;
-
-        X265_FREE(m_picIdxReadCnt);
-        X265_FREE(m_analysisWrite);
-        X265_FREE(m_analysisRead);
-
-        X265_FREE(m_passEnc);
-    }
-
-    PassEncoder::PassEncoder(uint32_t id, CLIOptions cliopt, AbrEncoder *parent)
-    {
-        m_id = id;
-        m_cliopt = cliopt;
-        m_parent = parent;
-        if(!(m_cliopt.enableScaler && m_id))
-            m_input = m_cliopt.input;
-        m_param = cliopt.param;
-        m_inputOver = false;
-        m_lastIdx = -1;
-        m_encoder = NULL;
-        m_scaler = NULL;
-        m_reader = NULL;
-        m_ret = 0;
-    }
-
-    int PassEncoder::init(int &result)
-    {
-        if (m_parent->m_numEncodes > 1)
-            setReuseLevel();
-                
-        if (!(m_cliopt.enableScaler && m_id))
-            m_reader = new Reader(m_id, this);
-        else
-        {
-            VideoDesc *src = NULL, *dst = NULL;
-            dst = new VideoDesc(m_param->sourceWidth, m_param->sourceHeight, m_param->internalCsp, m_param->internalBitDepth);
-            int dstW = m_parent->m_passEnc[m_id - 1]->m_param->sourceWidth;
-            int dstH = m_parent->m_passEnc[m_id - 1]->m_param->sourceHeight;
-            src = new VideoDesc(dstW, dstH, m_param->internalCsp, m_param->internalBitDepth);
-            if (src != NULL && dst != NULL)
-            {
-                m_scaler = new Scaler(0, 1, m_id, src, dst, this);
-                if (!m_scaler)
-                {
-                    x265_log(m_param, X265_LOG_ERROR, "\n MALLOC failure in Scaler");
-                    result = 4;
-                }
-            }
-        }
-
-        /* note: we could try to acquire a different libx265 API here based on
-        * the profile found during option parsing, but it must be done before
-        * opening an encoder */
-
-        if (m_param)
-            m_encoder = m_cliopt.api->encoder_open(m_param);
-        if (!m_encoder)
-        {
-            x265_log(NULL, X265_LOG_ERROR, "x265_encoder_open() failed for Enc, \n");
-            m_ret = 2;
-            return -1;

 
@@ -1,1111 +1,1111 @@
-/*****************************************************************************
-* Copyright (C) 2013-2020 MulticoreWare, Inc
-*
-* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
-*          Aruna Matheswaran <aruna@multicorewareinc.com>
-*
-* This program is free software; you can redistribute it and/or modify
-* it under the terms of the GNU General Public License as published by
-* the Free Software Foundation; either version 2 of the License, or
-* (at your option) any later version.
-*
-* This program is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-* GNU General Public License for more details.
-*
-* You should have received a copy of the GNU General Public License
-* along with this program; if not, write to the Free Software
-* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-*
-* This program is also available under a commercial proprietary license.
-* For more information, contact us at license @ x265.com.
-*****************************************************************************/
-
-#include "abrEncApp.h"
-#include "mv.h"
-#include "slice.h"
-#include "param.h"
-
-#include <signal.h>
-#include <errno.h>
-
-#include <queue>
-
-using namespace X265_NS;
-
-/* Ctrl-C handler */
-static volatile sig_atomic_t b_ctrl_c /* = 0 */;
-static void sigint_handler(int)
-{
-    b_ctrl_c = 1;
-}
-
-namespace X265_NS {
-    // private namespace
-#define X265_INPUT_QUEUE_SIZE 250
-
-    AbrEncoder::AbrEncoder(CLIOptions cliopt[], uint8_t numEncodes, int &ret)
-    {
-        m_numEncodes = numEncodes;
-        m_numActiveEncodes.set(numEncodes);
-        m_queueSize = (numEncodes > 1) ? X265_INPUT_QUEUE_SIZE : 1;
-        m_passEnc = X265_MALLOC(PassEncoder*, m_numEncodes);
-
-        for (uint8_t i = 0; i < m_numEncodes; i++)
-        {
-            m_passEnc[i] = new PassEncoder(i, cliopt[i], this);
-            if (!m_passEnc[i])
-            {
-                x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for passEncoder\n");
-                ret = 4;
-            }
-            m_passEnc[i]->init(ret);
-        }
-
-        if (!allocBuffers())
-        {
-            x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for buffers\n");
-            ret = 4;
-        }
-
-        /* start passEncoder worker threads */
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
-            m_passEnc[pass]->startThreads();
-    }
-
-    bool AbrEncoder::allocBuffers()
-    {
-        m_inputPicBuffer = X265_MALLOC(x265_picture**, m_numEncodes);
-        m_analysisBuffer = X265_MALLOC(x265_analysis_data*, m_numEncodes);
-
-        m_picWriteCnt = new ThreadSafeInteger[m_numEncodes];
-        m_picReadCnt = new ThreadSafeInteger[m_numEncodes];
-        m_analysisWriteCnt = new ThreadSafeInteger[m_numEncodes];
-        m_analysisReadCnt = new ThreadSafeInteger[m_numEncodes];
-
-        m_picIdxReadCnt = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
-        m_analysisWrite = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
-        m_analysisRead = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
-        m_readFlag = X265_MALLOC(int*, m_numEncodes);
-
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
-        {
-            m_inputPicBuffer[pass] = X265_MALLOC(x265_picture*, m_queueSize);
-            for (uint32_t idx = 0; idx < m_queueSize; idx++)
-            {
-                m_inputPicBuffer[pass][idx] = x265_picture_alloc();
-                x265_picture_init(m_passEnc[pass]->m_param, m_inputPicBuffer[pass][idx]);
-            }
-
-            CHECKED_MALLOC_ZERO(m_analysisBuffer[pass], x265_analysis_data, m_queueSize);
-            m_picIdxReadCnt[pass] = new ThreadSafeInteger[m_queueSize];
-            m_analysisWrite[pass] = new ThreadSafeInteger[m_queueSize];
-            m_analysisRead[pass] = new ThreadSafeInteger[m_queueSize];
-            m_readFlag[pass] = X265_MALLOC(int, m_queueSize);
-        }
-        return true;
-    fail:
-        return false;
-    }
-
-    void AbrEncoder::destroy()
-    {
-        x265_cleanup(); /* Free library singletons */
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
-        {
-            for (uint32_t index = 0; index < m_queueSize; index++)
-            {
-                X265_FREE(m_inputPicBuffer[pass][index]->planes[0]);
-                x265_picture_free(m_inputPicBuffer[pass][index]);
-            }
-
-            X265_FREE(m_inputPicBuffer[pass]);
-            X265_FREE(m_analysisBuffer[pass]);
-            X265_FREE(m_readFlag[pass]);
-            delete[] m_picIdxReadCnt[pass];
-            delete[] m_analysisWrite[pass];
-            delete[] m_analysisRead[pass];
-            m_passEnc[pass]->destroy();
-            delete m_passEnc[pass];
-        }
-        X265_FREE(m_inputPicBuffer);
-        X265_FREE(m_analysisBuffer);
-        X265_FREE(m_readFlag);
-
-        delete[] m_picWriteCnt;
-        delete[] m_picReadCnt;
-        delete[] m_analysisWriteCnt;
-        delete[] m_analysisReadCnt;
-
-        X265_FREE(m_picIdxReadCnt);
-        X265_FREE(m_analysisWrite);
-        X265_FREE(m_analysisRead);
-
-        X265_FREE(m_passEnc);
-    }
-
-    PassEncoder::PassEncoder(uint32_t id, CLIOptions cliopt, AbrEncoder *parent)
-    {
-        m_id = id;
-        m_cliopt = cliopt;
-        m_parent = parent;
-        if(!(m_cliopt.enableScaler && m_id))
-            m_input = m_cliopt.input;
-        m_param = cliopt.param;
-        m_inputOver = false;
-        m_lastIdx = -1;
-        m_encoder = NULL;
-        m_scaler = NULL;
-        m_reader = NULL;
-        m_ret = 0;
-    }
-
-    int PassEncoder::init(int &result)
-    {
-        if (m_parent->m_numEncodes > 1)
-            setReuseLevel();
-                
-        if (!(m_cliopt.enableScaler && m_id))
-            m_reader = new Reader(m_id, this);
-        else
-        {
-            VideoDesc *src = NULL, *dst = NULL;
-            dst = new VideoDesc(m_param->sourceWidth, m_param->sourceHeight, m_param->internalCsp, m_param->internalBitDepth);
-            int dstW = m_parent->m_passEnc[m_id - 1]->m_param->sourceWidth;
-            int dstH = m_parent->m_passEnc[m_id - 1]->m_param->sourceHeight;
-            src = new VideoDesc(dstW, dstH, m_param->internalCsp, m_param->internalBitDepth);
-            if (src != NULL && dst != NULL)
-            {
-                m_scaler = new Scaler(0, 1, m_id, src, dst, this);
-                if (!m_scaler)
-                {
-                    x265_log(m_param, X265_LOG_ERROR, "\n MALLOC failure in Scaler");
-                    result = 4;
-                }
-            }
-        }
-
-        /* note: we could try to acquire a different libx265 API here based on
-        * the profile found during option parsing, but it must be done before
-        * opening an encoder */
-
-        if (m_param)
-            m_encoder = m_cliopt.api->encoder_open(m_param);
-        if (!m_encoder)
-        {
-            x265_log(NULL, X265_LOG_ERROR, "x265_encoder_open() failed for Enc, \n");
-            m_ret = 2;
-            return -1;
​

x265_3.5.tar.gz/source/abrEncApp.h -> x265_3.6.tar.gz/source/abrEncApp.h Changed

 
@@ -91,6 +91,7 @@
         FILE*    m_qpfile;
         FILE*    m_zoneFile;
         FILE*    m_dolbyVisionRpu;/* File containing Dolby Vision BL RPU metadata */
+        FILE*    m_scenecutAwareQpConfig;
 
         int m_ret;
 
​

x265_3.5.tar.gz/source/cmake/FindNeon.cmake -> x265_3.6.tar.gz/source/cmake/FindNeon.cmake Changed

 
@@ -1,10 +1,21 @@
 include(FindPackageHandleStandardArgs)
 
 # Check the version of neon supported by the ARM CPU
-execute_process(COMMAND cat /proc/cpuinfo | grep Features | grep neon
-                OUTPUT_VARIABLE neon_version
-                ERROR_QUIET
-                OUTPUT_STRIP_TRAILING_WHITESPACE)
+if(APPLE)
+    execute_process(COMMAND sysctl -a
+                    COMMAND grep "hw.optional.neon: 1"
+                    OUTPUT_VARIABLE neon_version
+                    ERROR_QUIET
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+else()
+    execute_process(COMMAND cat /proc/cpuinfo
+                    COMMAND grep Features
+                    COMMAND grep neon
+                    OUTPUT_VARIABLE neon_version
+                    ERROR_QUIET
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+endif()
+
 if(neon_version)
     set(CPU_HAS_NEON 1)
 endif()
​

x265_3.6.tar.gz/source/cmake/FindSVE.cmake Added

 
@@ -0,0 +1,21 @@
+include(FindPackageHandleStandardArgs)
+
+# Check the version of SVE supported by the ARM CPU
+if(APPLE)
+    execute_process(COMMAND sysctl -a
+                    COMMAND grep "hw.optional.sve: 1"
+                    OUTPUT_VARIABLE sve_version
+                    ERROR_QUIET
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+else()
+    execute_process(COMMAND cat /proc/cpuinfo
+                    COMMAND grep Features
+                    COMMAND grep -e "sve$" -e "sve[[:space:]]"
+                    OUTPUT_VARIABLE sve_version
+                    ERROR_QUIET
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+endif()
+
+if(sve_version)
+    set(CPU_HAS_SVE 1)
+endif()
​

x265_3.6.tar.gz/source/cmake/FindSVE2.cmake Added

 
@@ -0,0 +1,22 @@
+include(FindPackageHandleStandardArgs)
+
+# Check the version of SVE2 supported by the ARM CPU
+if(APPLE)
+    execute_process(COMMAND sysctl -a
+                    COMMAND grep "hw.optional.sve2: 1"
+                    OUTPUT_VARIABLE sve2_version
+                    ERROR_QUIET
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+else()
+    execute_process(COMMAND cat /proc/cpuinfo
+                    COMMAND grep Features
+                    COMMAND grep sve2
+                    OUTPUT_VARIABLE sve2_version
+                    ERROR_QUIET
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+endif()
+
+if(sve2_version)
+    set(CPU_HAS_SVE 1)
+    set(CPU_HAS_SVE2 1)
+endif()
​

x265_3.5.tar.gz/source/common/CMakeLists.txt -> x265_3.6.tar.gz/source/common/CMakeLists.txt Changed

@@ -84,35 +84,42 @@
 endif(ENABLE_ASSEMBLY AND X86)
 
 if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
-    if(ARM64)
-        if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
-            message(STATUS "Detected CXX compiler using -O3 optimization level")
-            add_definitions(-DAUTO_VECTORIZE=1)
-        endif()
-        set(C_SRCS asm-primitives.cpp pixel.h ipfilter8.h)
-
-        # add ARM assembly/intrinsic files here
-        set(A_SRCS asm.S mc-a.S sad-a.S pixel-util.S ipfilter8.S)
-        set(VEC_PRIMITIVES)
+    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
 
-        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
-        foreach(SRC ${C_SRCS})
-            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
-        endforeach()
-    else()
-        set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
+    # add ARM assembly/intrinsic files here
+    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
+    set(VEC_PRIMITIVES)
 
-        # add ARM assembly/intrinsic files here
-        set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
-        set(VEC_PRIMITIVES)
+    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
+    foreach(SRC ${C_SRCS})
+        set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
+    endforeach()
+    source_group(Assembly FILES ${ASM_PRIMITIVES})
+endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
 
-        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
-        foreach(SRC ${C_SRCS})
-            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
-        endforeach()
+if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
+    if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
+        message(STATUS "Detected CXX compiler using -O3 optimization level")
+        add_definitions(-DAUTO_VECTORIZE=1)
     endif()
+
+    set(C_SRCS asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h)
+    enable_language(ASM)
+
+    # add ARM assembly/intrinsic files here
+    set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S sad-a-common.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S ipfilter.S ipfilter-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S)
+    set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
+    set(A_SRCS_SVE2 mc-a-sve2.S sad-a-sve2.S pixel-util-sve2.S ipfilter-sve2.S ssd-a-sve2.S)
+    set(VEC_PRIMITIVES)
+
+    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
+    set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "ARM Assembly Sources that use SVE instruction set")
+    set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "ARM Assembly Sources that use SVE2 instruction set")
+    foreach(SRC ${C_SRCS})
+        set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+    endforeach()
     source_group(Assembly FILES ${ASM_PRIMITIVES})
-endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
+endif(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
 
 if(POWER)
     set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS -DX265_VERSION=${X265_VERSION})
@@ -169,4 +176,6 @@
     scalinglist.cpp scalinglist.h
     quant.cpp quant.h contexts.h
     deblock.cpp deblock.h
-    scaler.cpp scaler.h)
+    scaler.cpp scaler.h
+    ringmem.cpp ringmem.h
+    temporalfilter.cpp temporalfilter.h)

 
@@ -84,35 +84,42 @@
 endif(ENABLE_ASSEMBLY AND X86)
 
 if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
-    if(ARM64)
-        if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
-            message(STATUS "Detected CXX compiler using -O3 optimization level")
-            add_definitions(-DAUTO_VECTORIZE=1)
-        endif()
-        set(C_SRCS asm-primitives.cpp pixel.h ipfilter8.h)
-
-        # add ARM assembly/intrinsic files here
-        set(A_SRCS asm.S mc-a.S sad-a.S pixel-util.S ipfilter8.S)
-        set(VEC_PRIMITIVES)
+    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
 
-        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
-        foreach(SRC ${C_SRCS})
-            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
-        endforeach()
-    else()
-        set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
+    # add ARM assembly/intrinsic files here
+    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
+    set(VEC_PRIMITIVES)
 
-        # add ARM assembly/intrinsic files here
-        set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
-        set(VEC_PRIMITIVES)
+    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
+    foreach(SRC ${C_SRCS})
+        set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
+    endforeach()
+    source_group(Assembly FILES ${ASM_PRIMITIVES})
+endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
 
-        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
-        foreach(SRC ${C_SRCS})
-            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
-        endforeach()
+if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
+    if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
+        message(STATUS "Detected CXX compiler using -O3 optimization level")
+        add_definitions(-DAUTO_VECTORIZE=1)
     endif()
+
+    set(C_SRCS asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h)
+    enable_language(ASM)
+
+    # add ARM assembly/intrinsic files here
+    set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S sad-a-common.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S ipfilter.S ipfilter-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S)
+    set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
+    set(A_SRCS_SVE2 mc-a-sve2.S sad-a-sve2.S pixel-util-sve2.S ipfilter-sve2.S ssd-a-sve2.S)
+    set(VEC_PRIMITIVES)
+
+    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
+    set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "ARM Assembly Sources that use SVE instruction set")
+    set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "ARM Assembly Sources that use SVE2 instruction set")
+    foreach(SRC ${C_SRCS})
+        set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+    endforeach()
     source_group(Assembly FILES ${ASM_PRIMITIVES})
-endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
+endif(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
 
 if(POWER)
     set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS -DX265_VERSION=${X265_VERSION})
@@ -169,4 +176,6 @@
     scalinglist.cpp scalinglist.h
     quant.cpp quant.h contexts.h
     deblock.cpp deblock.h
-    scaler.cpp scaler.h)
+    scaler.cpp scaler.h
+    ringmem.cpp ringmem.h
+    temporalfilter.cpp temporalfilter.h)
​

x265_3.6.tar.gz/source/common/aarch64/arm64-utils.cpp Added

@@ -0,0 +1,300 @@
+#include "common.h"
+#include "x265.h"
+#include "arm64-utils.h"
+#include <arm_neon.h>
+
+#define COPY_16(d,s) *(uint8x16_t *)(d) = *(uint8x16_t *)(s)
+namespace X265_NS
+{
+
+
+
+void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
+{
+    uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7;
+    uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+    a0 = *(uint8x8_t *)(src + 0 * sstride);
+    a1 = *(uint8x8_t *)(src + 1 * sstride);
+    a2 = *(uint8x8_t *)(src + 2 * sstride);
+    a3 = *(uint8x8_t *)(src + 3 * sstride);
+    a4 = *(uint8x8_t *)(src + 4 * sstride);
+    a5 = *(uint8x8_t *)(src + 5 * sstride);
+    a6 = *(uint8x8_t *)(src + 6 * sstride);
+    a7 = *(uint8x8_t *)(src + 7 * sstride);
+
+    b0 = vtrn1_u32(a0, a4);
+    b1 = vtrn1_u32(a1, a5);
+    b2 = vtrn1_u32(a2, a6);
+    b3 = vtrn1_u32(a3, a7);
+    b4 = vtrn2_u32(a0, a4);
+    b5 = vtrn2_u32(a1, a5);
+    b6 = vtrn2_u32(a2, a6);
+    b7 = vtrn2_u32(a3, a7);
+
+    a0 = vtrn1_u16(b0, b2);
+    a1 = vtrn1_u16(b1, b3);
+    a2 = vtrn2_u16(b0, b2);
+    a3 = vtrn2_u16(b1, b3);
+    a4 = vtrn1_u16(b4, b6);
+    a5 = vtrn1_u16(b5, b7);
+    a6 = vtrn2_u16(b4, b6);
+    a7 = vtrn2_u16(b5, b7);
+
+    b0 = vtrn1_u8(a0, a1);
+    b1 = vtrn2_u8(a0, a1);
+    b2 = vtrn1_u8(a2, a3);
+    b3 = vtrn2_u8(a2, a3);
+    b4 = vtrn1_u8(a4, a5);
+    b5 = vtrn2_u8(a4, a5);
+    b6 = vtrn1_u8(a6, a7);
+    b7 = vtrn2_u8(a6, a7);
+
+    *(uint8x8_t *)(dst + 0 * dstride) = b0;
+    *(uint8x8_t *)(dst + 1 * dstride) = b1;
+    *(uint8x8_t *)(dst + 2 * dstride) = b2;
+    *(uint8x8_t *)(dst + 3 * dstride) = b3;
+    *(uint8x8_t *)(dst + 4 * dstride) = b4;
+    *(uint8x8_t *)(dst + 5 * dstride) = b5;
+    *(uint8x8_t *)(dst + 6 * dstride) = b6;
+    *(uint8x8_t *)(dst + 7 * dstride) = b7;
+}
+
+
+
+
+
+
+void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
+{
+    uint16x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aA, aB, aC, aD, aE, aF;
+    uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, bA, bB, bC, bD, bE, bF;
+    uint16x8_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF;
+    uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, dA, dB, dC, dD, dE, dF;
+
+    a0 = *(uint16x8_t *)(src + 0 * sstride);
+    a1 = *(uint16x8_t *)(src + 1 * sstride);
+    a2 = *(uint16x8_t *)(src + 2 * sstride);
+    a3 = *(uint16x8_t *)(src + 3 * sstride);
+    a4 = *(uint16x8_t *)(src + 4 * sstride);
+    a5 = *(uint16x8_t *)(src + 5 * sstride);
+    a6 = *(uint16x8_t *)(src + 6 * sstride);
+    a7 = *(uint16x8_t *)(src + 7 * sstride);
+    a8 = *(uint16x8_t *)(src + 8 * sstride);
+    a9 = *(uint16x8_t *)(src + 9 * sstride);
+    aA = *(uint16x8_t *)(src + 10 * sstride);
+    aB = *(uint16x8_t *)(src + 11 * sstride);
+    aC = *(uint16x8_t *)(src + 12 * sstride);
+    aD = *(uint16x8_t *)(src + 13 * sstride);
+    aE = *(uint16x8_t *)(src + 14 * sstride);
+    aF = *(uint16x8_t *)(src + 15 * sstride);
+
+    b0 = vtrn1q_u64(a0, a8);
+    b1 = vtrn1q_u64(a1, a9);
+    b2 = vtrn1q_u64(a2, aA);
+    b3 = vtrn1q_u64(a3, aB);
+    b4 = vtrn1q_u64(a4, aC);
+    b5 = vtrn1q_u64(a5, aD);
+    b6 = vtrn1q_u64(a6, aE);
+    b7 = vtrn1q_u64(a7, aF);
+    b8 = vtrn2q_u64(a0, a8);
+    b9 = vtrn2q_u64(a1, a9);
+    bA = vtrn2q_u64(a2, aA);
+    bB = vtrn2q_u64(a3, aB);
+    bC = vtrn2q_u64(a4, aC);
+    bD = vtrn2q_u64(a5, aD);
+    bE = vtrn2q_u64(a6, aE);
+    bF = vtrn2q_u64(a7, aF);
+
+    c0 = vtrn1q_u32(b0, b4);
+    c1 = vtrn1q_u32(b1, b5);
+    c2 = vtrn1q_u32(b2, b6);
+    c3 = vtrn1q_u32(b3, b7);
+    c4 = vtrn2q_u32(b0, b4);
+    c5 = vtrn2q_u32(b1, b5);
+    c6 = vtrn2q_u32(b2, b6);
+    c7 = vtrn2q_u32(b3, b7);
+    c8 = vtrn1q_u32(b8, bC);
+    c9 = vtrn1q_u32(b9, bD);
+    cA = vtrn1q_u32(bA, bE);
+    cB = vtrn1q_u32(bB, bF);
+    cC = vtrn2q_u32(b8, bC);
+    cD = vtrn2q_u32(b9, bD);
+    cE = vtrn2q_u32(bA, bE);
+    cF = vtrn2q_u32(bB, bF);
+
+    d0 = vtrn1q_u16(c0, c2);
+    d1 = vtrn1q_u16(c1, c3);
+    d2 = vtrn2q_u16(c0, c2);
+    d3 = vtrn2q_u16(c1, c3);
+    d4 = vtrn1q_u16(c4, c6);
+    d5 = vtrn1q_u16(c5, c7);
+    d6 = vtrn2q_u16(c4, c6);
+    d7 = vtrn2q_u16(c5, c7);
+    d8 = vtrn1q_u16(c8, cA);
+    d9 = vtrn1q_u16(c9, cB);
+    dA = vtrn2q_u16(c8, cA);
+    dB = vtrn2q_u16(c9, cB);
+    dC = vtrn1q_u16(cC, cE);
+    dD = vtrn1q_u16(cD, cF);
+    dE = vtrn2q_u16(cC, cE);
+    dF = vtrn2q_u16(cD, cF);
+
+    *(uint16x8_t *)(dst + 0 * dstride)  = vtrn1q_u8(d0, d1);
+    *(uint16x8_t *)(dst + 1 * dstride)  = vtrn2q_u8(d0, d1);
+    *(uint16x8_t *)(dst + 2 * dstride)  = vtrn1q_u8(d2, d3);
+    *(uint16x8_t *)(dst + 3 * dstride)  = vtrn2q_u8(d2, d3);
+    *(uint16x8_t *)(dst + 4 * dstride)  = vtrn1q_u8(d4, d5);
+    *(uint16x8_t *)(dst + 5 * dstride)  = vtrn2q_u8(d4, d5);
+    *(uint16x8_t *)(dst + 6 * dstride)  = vtrn1q_u8(d6, d7);
+    *(uint16x8_t *)(dst + 7 * dstride)  = vtrn2q_u8(d6, d7);
+    *(uint16x8_t *)(dst + 8 * dstride)  = vtrn1q_u8(d8, d9);
+    *(uint16x8_t *)(dst + 9 * dstride)  = vtrn2q_u8(d8, d9);
+    *(uint16x8_t *)(dst + 10 * dstride)  = vtrn1q_u8(dA, dB);
+    *(uint16x8_t *)(dst + 11 * dstride)  = vtrn2q_u8(dA, dB);
+    *(uint16x8_t *)(dst + 12 * dstride)  = vtrn1q_u8(dC, dD);
+    *(uint16x8_t *)(dst + 13 * dstride)  = vtrn2q_u8(dC, dD);
+    *(uint16x8_t *)(dst + 14 * dstride)  = vtrn1q_u8(dE, dF);
+    *(uint16x8_t *)(dst + 15 * dstride)  = vtrn2q_u8(dE, dF);
+
+
+}
+
+
+void transpose32x32(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
+{
+    //assumption: there is no partial overlap
+    transpose16x16(dst, src, dstride, sstride);
+    transpose16x16(dst + 16 * dstride + 16, src + 16 * sstride + 16, dstride, sstride);
+    if (dst == src)
+    {
+        uint8_t tmp[16 * 16] __attribute__((aligned(64)));
+        transpose16x16(tmp, src + 16, 16, sstride);
+        transpose16x16(dst + 16, src + 16 * sstride, dstride, sstride);
+        for (int i = 0; i < 16; i++)
+        {
+            COPY_16(dst + (16 + i)*dstride, tmp + 16 * i);
+        }
+    }
+    else
+    {
+        transpose16x16(dst + 16 * dstride, src + 16, dstride, sstride);
+        transpose16x16(dst + 16, src + 16 * sstride, dstride, sstride);
+    }
+
+}
+
+
+
+void transpose8x8(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride)
+{
+    uint16x8_t a0, a1, a2, a3, a4, a5, a6, a7;
+    uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+    a0 = *(uint16x8_t *)(src + 0 * sstride);
+    a1 = *(uint16x8_t *)(src + 1 * sstride);
+    a2 = *(uint16x8_t *)(src + 2 * sstride);
+    a3 = *(uint16x8_t *)(src + 3 * sstride);
+    a4 = *(uint16x8_t *)(src + 4 * sstride);
+    a5 = *(uint16x8_t *)(src + 5 * sstride);

 
@@ -0,0 +1,300 @@
+#include "common.h"
+#include "x265.h"
+#include "arm64-utils.h"
+#include <arm_neon.h>
+
+// Copies 16 bytes from s to d with one uint8x16_t assignment.
+// The expansion is parenthesized so it stays a single expression wherever it is used.
+#define COPY_16(d,s) (*(uint8x16_t *)(d) = *(uint8x16_t *)(s))
+namespace X265_NS
+{
+
+
+/* Transposes an 8x8 block of bytes.
+ *   dst, src         top-left byte of the destination / source block
+ *   dstride, sstride row pitch of dst / src (added to the uint8_t pointers)
+ * Eight 8-byte rows are loaded, interleaved in three vtrn stages
+ * (32-bit, 16-bit, then 8-bit lanes) and stored as eight 8-byte rows.
+ * Every source row is read before the first destination row is written.
+ * NOTE(review): the _u32/_u16 intrinsics are applied to uint8x8_t values,
+ * which presumably relies on lax vector conversions - confirm build flags.
+ */
+void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
+{
+    uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7;
+    uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+    a0 = *(uint8x8_t *)(src + 0 * sstride);  // load the eight source rows
+    a1 = *(uint8x8_t *)(src + 1 * sstride);
+    a2 = *(uint8x8_t *)(src + 2 * sstride);
+    a3 = *(uint8x8_t *)(src + 3 * sstride);
+    a4 = *(uint8x8_t *)(src + 4 * sstride);
+    a5 = *(uint8x8_t *)(src + 5 * sstride);
+    a6 = *(uint8x8_t *)(src + 6 * sstride);
+    a7 = *(uint8x8_t *)(src + 7 * sstride);
+
+    b0 = vtrn1_u32(a0, a4);  // stage 1: pair row i with row i+4 on 32-bit lanes
+    b1 = vtrn1_u32(a1, a5);
+    b2 = vtrn1_u32(a2, a6);
+    b3 = vtrn1_u32(a3, a7);
+    b4 = vtrn2_u32(a0, a4);
+    b5 = vtrn2_u32(a1, a5);
+    b6 = vtrn2_u32(a2, a6);
+    b7 = vtrn2_u32(a3, a7);
+
+    a0 = vtrn1_u16(b0, b2);  // stage 2: pair vectors two apart on 16-bit lanes
+    a1 = vtrn1_u16(b1, b3);
+    a2 = vtrn2_u16(b0, b2);
+    a3 = vtrn2_u16(b1, b3);
+    a4 = vtrn1_u16(b4, b6);
+    a5 = vtrn1_u16(b5, b7);
+    a6 = vtrn2_u16(b4, b6);
+    a7 = vtrn2_u16(b5, b7);
+
+    b0 = vtrn1_u8(a0, a1);  // stage 3: pair neighbours on 8-bit lanes; b0..b7 are the output rows
+    b1 = vtrn2_u8(a0, a1);
+    b2 = vtrn1_u8(a2, a3);
+    b3 = vtrn2_u8(a2, a3);
+    b4 = vtrn1_u8(a4, a5);
+    b5 = vtrn2_u8(a4, a5);
+    b6 = vtrn1_u8(a6, a7);
+    b7 = vtrn2_u8(a6, a7);
+
+    *(uint8x8_t *)(dst + 0 * dstride) = b0;  // store the eight transposed rows
+    *(uint8x8_t *)(dst + 1 * dstride) = b1;
+    *(uint8x8_t *)(dst + 2 * dstride) = b2;
+    *(uint8x8_t *)(dst + 3 * dstride) = b3;
+    *(uint8x8_t *)(dst + 4 * dstride) = b4;
+    *(uint8x8_t *)(dst + 5 * dstride) = b5;
+    *(uint8x8_t *)(dst + 6 * dstride) = b6;
+    *(uint8x8_t *)(dst + 7 * dstride) = b7;
+}
+
+
+
+
+
+/* Transposes a 16x16 block of bytes.
+ *   dst, src         top-left byte of the destination / source block
+ *   dstride, sstride row pitch of dst / src (added to the uint8_t pointers)
+ * Sixteen 16-byte rows are loaded, interleaved in four vtrn stages
+ * (64-bit, 32-bit, 16-bit, then 8-bit lanes) and stored as sixteen rows.
+ * Every source row is read before the first destination row is written.
+ * NOTE(review): rows are held as uint16x8_t although the data is bytes, and
+ * the _u64/_u32/_u8 intrinsics are applied to that type - presumably the
+ * build enables lax vector conversions; confirm.
+ */
+void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
+{
+    uint16x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aA, aB, aC, aD, aE, aF;
+    uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, bA, bB, bC, bD, bE, bF;
+    uint16x8_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF;
+    uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, dA, dB, dC, dD, dE, dF;
+
+    a0 = *(uint16x8_t *)(src + 0 * sstride);  // load the sixteen source rows (16 bytes each)
+    a1 = *(uint16x8_t *)(src + 1 * sstride);
+    a2 = *(uint16x8_t *)(src + 2 * sstride);
+    a3 = *(uint16x8_t *)(src + 3 * sstride);
+    a4 = *(uint16x8_t *)(src + 4 * sstride);
+    a5 = *(uint16x8_t *)(src + 5 * sstride);
+    a6 = *(uint16x8_t *)(src + 6 * sstride);
+    a7 = *(uint16x8_t *)(src + 7 * sstride);
+    a8 = *(uint16x8_t *)(src + 8 * sstride);
+    a9 = *(uint16x8_t *)(src + 9 * sstride);
+    aA = *(uint16x8_t *)(src + 10 * sstride);
+    aB = *(uint16x8_t *)(src + 11 * sstride);
+    aC = *(uint16x8_t *)(src + 12 * sstride);
+    aD = *(uint16x8_t *)(src + 13 * sstride);
+    aE = *(uint16x8_t *)(src + 14 * sstride);
+    aF = *(uint16x8_t *)(src + 15 * sstride);
+
+    b0 = vtrn1q_u64(a0, a8);  // stage 1: pair row i with row i+8 on 64-bit lanes
+    b1 = vtrn1q_u64(a1, a9);
+    b2 = vtrn1q_u64(a2, aA);
+    b3 = vtrn1q_u64(a3, aB);
+    b4 = vtrn1q_u64(a4, aC);
+    b5 = vtrn1q_u64(a5, aD);
+    b6 = vtrn1q_u64(a6, aE);
+    b7 = vtrn1q_u64(a7, aF);
+    b8 = vtrn2q_u64(a0, a8);
+    b9 = vtrn2q_u64(a1, a9);
+    bA = vtrn2q_u64(a2, aA);
+    bB = vtrn2q_u64(a3, aB);
+    bC = vtrn2q_u64(a4, aC);
+    bD = vtrn2q_u64(a5, aD);
+    bE = vtrn2q_u64(a6, aE);
+    bF = vtrn2q_u64(a7, aF);
+
+    c0 = vtrn1q_u32(b0, b4);  // stage 2: pair vectors four apart on 32-bit lanes
+    c1 = vtrn1q_u32(b1, b5);
+    c2 = vtrn1q_u32(b2, b6);
+    c3 = vtrn1q_u32(b3, b7);
+    c4 = vtrn2q_u32(b0, b4);
+    c5 = vtrn2q_u32(b1, b5);
+    c6 = vtrn2q_u32(b2, b6);
+    c7 = vtrn2q_u32(b3, b7);
+    c8 = vtrn1q_u32(b8, bC);
+    c9 = vtrn1q_u32(b9, bD);
+    cA = vtrn1q_u32(bA, bE);
+    cB = vtrn1q_u32(bB, bF);
+    cC = vtrn2q_u32(b8, bC);
+    cD = vtrn2q_u32(b9, bD);
+    cE = vtrn2q_u32(bA, bE);
+    cF = vtrn2q_u32(bB, bF);
+
+    d0 = vtrn1q_u16(c0, c2);  // stage 3: pair vectors two apart on 16-bit lanes
+    d1 = vtrn1q_u16(c1, c3);
+    d2 = vtrn2q_u16(c0, c2);
+    d3 = vtrn2q_u16(c1, c3);
+    d4 = vtrn1q_u16(c4, c6);
+    d5 = vtrn1q_u16(c5, c7);
+    d6 = vtrn2q_u16(c4, c6);
+    d7 = vtrn2q_u16(c5, c7);
+    d8 = vtrn1q_u16(c8, cA);
+    d9 = vtrn1q_u16(c9, cB);
+    dA = vtrn2q_u16(c8, cA);
+    dB = vtrn2q_u16(c9, cB);
+    dC = vtrn1q_u16(cC, cE);
+    dD = vtrn1q_u16(cD, cF);
+    dE = vtrn2q_u16(cC, cE);
+    dF = vtrn2q_u16(cD, cF);
+
+    *(uint16x8_t *)(dst + 0 * dstride)  = vtrn1q_u8(d0, d1);  // stage 4: 8-bit interleave of neighbours, stored directly as output rows
+    *(uint16x8_t *)(dst + 1 * dstride)  = vtrn2q_u8(d0, d1);
+    *(uint16x8_t *)(dst + 2 * dstride)  = vtrn1q_u8(d2, d3);
+    *(uint16x8_t *)(dst + 3 * dstride)  = vtrn2q_u8(d2, d3);
+    *(uint16x8_t *)(dst + 4 * dstride)  = vtrn1q_u8(d4, d5);
+    *(uint16x8_t *)(dst + 5 * dstride)  = vtrn2q_u8(d4, d5);
+    *(uint16x8_t *)(dst + 6 * dstride)  = vtrn1q_u8(d6, d7);
+    *(uint16x8_t *)(dst + 7 * dstride)  = vtrn2q_u8(d6, d7);
+    *(uint16x8_t *)(dst + 8 * dstride)  = vtrn1q_u8(d8, d9);
+    *(uint16x8_t *)(dst + 9 * dstride)  = vtrn2q_u8(d8, d9);
+    *(uint16x8_t *)(dst + 10 * dstride)  = vtrn1q_u8(dA, dB);
+    *(uint16x8_t *)(dst + 11 * dstride)  = vtrn2q_u8(dA, dB);
+    *(uint16x8_t *)(dst + 12 * dstride)  = vtrn1q_u8(dC, dD);
+    *(uint16x8_t *)(dst + 13 * dstride)  = vtrn2q_u8(dC, dD);
+    *(uint16x8_t *)(dst + 14 * dstride)  = vtrn1q_u8(dE, dF);
+    *(uint16x8_t *)(dst + 15 * dstride)  = vtrn2q_u8(dE, dF);
+
+
+}
+
+
+// Transposes a 32x32 block of bytes as four 16x16 quadrants.
+//   dst, src         top-left byte of the destination / source block
+//   dstride, sstride row pitch of dst / src (added to the uint8_t pointers)
+// dst may be equal to src (in-place); any other overlap is not supported.
+void transpose32x32(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
+{
+    //assumption: there is no partial overlap
+    // The two diagonal quadrants map onto themselves.
+    transpose16x16(dst, src, dstride, sstride);
+    transpose16x16(dst + 16 * dstride + 16, src + 16 * sstride + 16, dstride, sstride);
+    if (dst == src)
+    {
+        // In-place: the off-diagonal quadrants swap positions, so park the
+        // transposed top-right quadrant in a scratch buffer (pitch 16) before
+        // the bottom-left quadrant is written over it.
+        uint8_t tmp[16 * 16] __attribute__((aligned(64)));
+        transpose16x16(tmp, src + 16, 16, sstride);
+        transpose16x16(dst + 16, src + 16 * sstride, dstride, sstride);
+        for (int i = 0; i < 16; i++)
+        {
+            COPY_16(dst + (16 + i)*dstride, tmp + 16 * i);
+        }
+    }
+    else
+    {
+        // Distinct buffers: write each off-diagonal quadrant straight to its mirrored position.
+        transpose16x16(dst + 16 * dstride, src + 16, dstride, sstride);
+        transpose16x16(dst + 16, src + 16 * sstride, dstride, sstride);
+    }
+
+}
+
+
+
+void transpose8x8(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride)
+{
+    uint16x8_t a0, a1, a2, a3, a4, a5, a6, a7;
+    uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+    a0 = *(uint16x8_t *)(src + 0 * sstride);
+    a1 = *(uint16x8_t *)(src + 1 * sstride);
+    a2 = *(uint16x8_t *)(src + 2 * sstride);
+    a3 = *(uint16x8_t *)(src + 3 * sstride);
+    a4 = *(uint16x8_t *)(src + 4 * sstride);
+    a5 = *(uint16x8_t *)(src + 5 * sstride);
​

x265_3.6.tar.gz/source/common/aarch64/arm64-utils.h Added

 
@@ -0,0 +1,15 @@
+#ifndef __ARM64_UTILS_H__
+#define __ARM64_UTILS_H__
+
+// Fixed-width and pointer-sized integer types used by the prototypes below.
+#include <stdint.h>
+
+namespace X265_NS
+{
+// Square block transpose helpers for 8-bit and 16-bit elements.
+//   dst, src         top-left element of the destination / source block
+//   dstride, sstride row pitch of the destination / source block
+void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride);
+void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride);
+void transpose32x32(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride);
+void transpose8x8(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride);
+void transpose16x16(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride);
+void transpose32x32(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride);
+}
+
+#endif
​

x265_3.5.tar.gz/source/common/aarch64/asm-primitives.cpp -> x265_3.6.tar.gz/source/common/aarch64/asm-primitives.cpp Changed

@@ -3,6 +3,7 @@
  *
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
  *          Yimeng Su <yimeng.su@huawei.com>
+ *          Sebastian Pop <spop@amazon.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -22,11 +23,659 @@
  * For more information, contact us at license @ x265.com.
  *****************************************************************************/
 
+
 #include "common.h"
 #include "primitives.h"
 #include "x265.h"
 #include "cpu.h"
 
+extern "C" {
+#include "fun-decls.h"
+}
+
+// Sets the `prim` member of p.cu[] for the five square sizes 4x4..64x64 to fncdef PFX(fname_WxH_cpu).
+#define ALL_LUMA_TU_TYPED(prim, fncdef, fname, cpu) \
+    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu)
+// Sets the `prim` member of p.cu[] for 4x4, 8x8, 16x16 and 64x64 (32x32 is not touched) to the _neon functions.
+#define LUMA_TU_TYPED_NEON(prim, fncdef, fname) \
+    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## _4x4_ ## neon); \
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## neon); \
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## neon); \
+    p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## _64x64_ ## neon)
+// Sets the `prim` member of p.cu[BLOCK_32x32] to the _sve function.
+#define LUMA_TU_TYPED_CAN_USE_SVE(prim, fncdef, fname) \
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## sve)
+#define ALL_LUMA_TU(prim, fname, cpu)      ALL_LUMA_TU_TYPED(prim, , fname, cpu)  // untyped form: forwards with an empty fncdef
+#define LUMA_TU_NEON(prim, fname)      LUMA_TU_TYPED_NEON(prim, , fname)  // untyped form: forwards with an empty fncdef
+#define LUMA_TU_CAN_USE_SVE(prim, fname)      LUMA_TU_TYPED_CAN_USE_SVE(prim, , fname)  // untyped form: forwards with an empty fncdef
+
+// Sets the `prim` member of p.pu[] for all 25 luma partition sizes listed below to fncdef PFX(fname_WxH_cpu).
+#define ALL_LUMA_PU_TYPED(prim, fncdef, fname, cpu) \
+    p.pu[LUMA_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
+    p.pu[LUMA_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu); \
+    p.pu[LUMA_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
+    p.pu[LUMA_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
+    p.pu[LUMA_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
+    p.pu[LUMA_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
+    p.pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
+    p.pu[LUMA_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
+    p.pu[LUMA_64x32].prim = fncdef PFX(fname ## _64x32_ ## cpu); \
+    p.pu[LUMA_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu); \
+    p.pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## cpu); \
+    p.pu[LUMA_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
+    p.pu[LUMA_16x4].prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
+    p.pu[LUMA_4x16].prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
+    p.pu[LUMA_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
+    p.pu[LUMA_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
+    p.pu[LUMA_32x8].prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
+    p.pu[LUMA_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
+    p.pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## cpu); \
+    p.pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## cpu); \
+    p.pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## cpu); \
+    p.pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## cpu)
+// Sets the `prim` member of p.pu[] for the width-4 sizes (4x4, 4x8, 4x16) to fncdef PFX(fname_WxH_cpu).
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, fncdef, fname, cpu) \
+    p.pu[LUMA_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
+    p.pu[LUMA_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
+    p.pu[LUMA_4x16].prim  = fncdef PFX(fname ## _4x16_ ## cpu)
+// Sets the `prim` member of p.pu[] for every listed size (all but the width-4 ones) to fncdef PFX(fname_WxH_cpu).
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, fncdef, fname, cpu) \
+    p.pu[LUMA_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu); \
+    p.pu[LUMA_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
+    p.pu[LUMA_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
+    p.pu[LUMA_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
+    p.pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
+    p.pu[LUMA_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
+    p.pu[LUMA_64x32].prim = fncdef PFX(fname ## _64x32_ ## cpu); \
+    p.pu[LUMA_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu); \
+    p.pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## cpu); \
+    p.pu[LUMA_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
+    p.pu[LUMA_16x4].prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
+    p.pu[LUMA_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
+    p.pu[LUMA_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
+    p.pu[LUMA_32x8].prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
+    p.pu[LUMA_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
+    p.pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## cpu); \
+    p.pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## cpu); \
+    p.pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## cpu); \
+    p.pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## cpu)
+// Sets the `prim` member of p.pu[] for the sixteen sizes listed below to the _neon functions.
+#define LUMA_PU_TYPED_NEON_1(prim, fncdef, fname) \
+    p.pu[LUMA_4x4].prim   = fncdef PFX(fname ## _4x4_ ## neon); \
+    p.pu[LUMA_4x8].prim   = fncdef PFX(fname ## _4x8_ ## neon); \
+    p.pu[LUMA_4x16].prim  = fncdef PFX(fname ## _4x16_ ## neon); \
+    p.pu[LUMA_12x16].prim = fncdef PFX(fname ## _12x16_ ## neon); \
+    p.pu[LUMA_8x8].prim   = fncdef PFX(fname ## _8x8_ ## neon); \
+    p.pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## neon); \
+    p.pu[LUMA_8x4].prim   = fncdef PFX(fname ## _8x4_ ## neon); \
+    p.pu[LUMA_16x8].prim  = fncdef PFX(fname ## _16x8_ ## neon); \
+    p.pu[LUMA_8x16].prim  = fncdef PFX(fname ## _8x16_ ## neon); \
+    p.pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## neon); \
+    p.pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## neon); \
+    p.pu[LUMA_16x4].prim  = fncdef PFX(fname ## _16x4_ ## neon); \
+    p.pu[LUMA_24x32].prim = fncdef PFX(fname ## _24x32_ ## neon); \
+    p.pu[LUMA_8x32].prim  = fncdef PFX(fname ## _8x32_ ## neon); \
+    p.pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## neon); \
+    p.pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## neon)
+// Sets the `prim` member of p.pu[] for the nine width-32 and width-64 sizes listed below to the _sve functions.
+#define LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname) \
+    p.pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## sve); \
+    p.pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## sve); \
+    p.pu[LUMA_32x16].prim = fncdef PFX(fname ## _32x16_ ## sve); \
+    p.pu[LUMA_64x32].prim = fncdef PFX(fname ## _64x32_ ## sve); \
+    p.pu[LUMA_32x64].prim = fncdef PFX(fname ## _32x64_ ## sve); \
+    p.pu[LUMA_32x24].prim = fncdef PFX(fname ## _32x24_ ## sve); \
+    p.pu[LUMA_32x8].prim  = fncdef PFX(fname ## _32x8_ ## sve); \
+    p.pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## sve); \
+    p.pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## sve)
+// Sets the `prim` member of p.pu[] for the thirteen sizes of width 4, 8 or 16 listed below to the _neon functions.
+#define LUMA_PU_TYPED_NEON_2(prim, fncdef, fname) \
+    p.pu[LUMA_4x4].prim   = fncdef PFX(fname ## _4x4_ ## neon); \
+    p.pu[LUMA_8x4].prim   = fncdef PFX(fname ## _8x4_ ## neon); \
+    p.pu[LUMA_4x8].prim   = fncdef PFX(fname ## _4x8_ ## neon); \
+    p.pu[LUMA_8x8].prim   = fncdef PFX(fname ## _8x8_ ## neon); \
+    p.pu[LUMA_16x8].prim  = fncdef PFX(fname ## _16x8_ ## neon); \
+    p.pu[LUMA_8x16].prim  = fncdef PFX(fname ## _8x16_ ## neon); \
+    p.pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## neon); \
+    p.pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## neon); \
+    p.pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## neon); \
+    p.pu[LUMA_16x4].prim  = fncdef PFX(fname ## _16x4_ ## neon); \
+    p.pu[LUMA_4x16].prim  = fncdef PFX(fname ## _4x16_ ## neon); \
+    p.pu[LUMA_8x32].prim  = fncdef PFX(fname ## _8x32_ ## neon); \
+    p.pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## neon)
+// Sets the `prim` member of p.pu[] for the twelve sizes listed below to fncdef PFX(fname_WxH_cpu).
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, fncdef, fname, cpu) \
+    p.pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu); \
+    p.pu[LUMA_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
+    p.pu[LUMA_64x32].prim = fncdef PFX(fname ## _64x32_ ## cpu); \
+    p.pu[LUMA_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu); \
+    p.pu[LUMA_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
+    p.pu[LUMA_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
+    p.pu[LUMA_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
+    p.pu[LUMA_32x8].prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
+    p.pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## cpu); \
+    p.pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## cpu); \
+    p.pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## cpu)
+// Sets the `prim` member of p.pu[] for the width-4 sizes (4x4, 4x8, 4x16) to the _neon functions.
+#define LUMA_PU_TYPED_NEON_3(prim, fncdef, fname) \
+    p.pu[LUMA_4x4].prim   = fncdef PFX(fname ## _4x4_ ## neon); \
+    p.pu[LUMA_4x8].prim   = fncdef PFX(fname ## _4x8_ ## neon); \
+    p.pu[LUMA_4x16].prim  = fncdef PFX(fname ## _4x16_ ## neon)
+// Sets the `prim` member of p.pu[] for every listed size (all but the width-4 ones) to the _sve2 functions.
+#define LUMA_PU_TYPED_CAN_USE_SVE2(prim, fncdef, fname) \
+    p.pu[LUMA_8x8].prim   = fncdef PFX(fname ## _8x8_ ## sve2); \
+    p.pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## sve2); \
+    p.pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## sve2); \
+    p.pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## sve2); \
+    p.pu[LUMA_8x4].prim   = fncdef PFX(fname ## _8x4_ ## sve2); \
+    p.pu[LUMA_16x8].prim  = fncdef PFX(fname ## _16x8_ ## sve2); \
+    p.pu[LUMA_8x16].prim  = fncdef PFX(fname ## _8x16_ ## sve2); \
+    p.pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## sve2); \
+    p.pu[LUMA_32x16].prim = fncdef PFX(fname ## _32x16_ ## sve2); \
+    p.pu[LUMA_64x32].prim = fncdef PFX(fname ## _64x32_ ## sve2); \
+    p.pu[LUMA_32x64].prim = fncdef PFX(fname ## _32x64_ ## sve2); \
+    p.pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## sve2); \
+    p.pu[LUMA_12x16].prim = fncdef PFX(fname ## _12x16_ ## sve2); \
+    p.pu[LUMA_16x4].prim  = fncdef PFX(fname ## _16x4_ ## sve2); \
+    p.pu[LUMA_32x24].prim = fncdef PFX(fname ## _32x24_ ## sve2); \
+    p.pu[LUMA_24x32].prim = fncdef PFX(fname ## _24x32_ ## sve2); \
+    p.pu[LUMA_32x8].prim  = fncdef PFX(fname ## _32x8_ ## sve2); \
+    p.pu[LUMA_8x32].prim  = fncdef PFX(fname ## _8x32_ ## sve2); \
+    p.pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## sve2); \
+    p.pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## sve2); \
+    p.pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## sve2); \
+    p.pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## sve2)
+// Sets the `prim` member of p.pu[] for the fifteen sizes listed below to the filterPixelToShort _neon functions.
+#define LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
+    p.pu[LUMA_4x4].prim   = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
+    p.pu[LUMA_8x8].prim   = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
+    p.pu[LUMA_16x16].prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
+    p.pu[LUMA_8x4].prim   = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
+    p.pu[LUMA_4x8].prim   = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
+    p.pu[LUMA_16x8].prim  = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
+    p.pu[LUMA_8x16].prim  = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
+    p.pu[LUMA_16x32].prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
+    p.pu[LUMA_16x12].prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
+    p.pu[LUMA_12x16].prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
+    p.pu[LUMA_16x4].prim  = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
+    p.pu[LUMA_4x16].prim  = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
+    p.pu[LUMA_24x32].prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
+    p.pu[LUMA_8x32].prim  = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
+    p.pu[LUMA_16x64].prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon)
+// Sets the `prim` member of p.pu[] for the ten sizes listed below to the filterPixelToShort _sve functions.
+#define LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
+    p.pu[LUMA_32x32].prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
+    p.pu[LUMA_32x16].prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \
+    p.pu[LUMA_32x64].prim = fncdef PFX(filterPixelToShort ## _32x64_ ## sve); \
+    p.pu[LUMA_32x24].prim = fncdef PFX(filterPixelToShort ## _32x24_ ## sve); \
+    p.pu[LUMA_32x8].prim  = fncdef PFX(filterPixelToShort ## _32x8_ ## sve); \
+    p.pu[LUMA_64x64].prim = fncdef PFX(filterPixelToShort ## _64x64_ ## sve); \
+    p.pu[LUMA_64x32].prim = fncdef PFX(filterPixelToShort ## _64x32_ ## sve); \
+    p.pu[LUMA_64x48].prim = fncdef PFX(filterPixelToShort ## _64x48_ ## sve); \
+    p.pu[LUMA_64x16].prim = fncdef PFX(filterPixelToShort ## _64x16_ ## sve); \
+    p.pu[LUMA_48x64].prim = fncdef PFX(filterPixelToShort ## _48x64_ ## sve)

 
@@ -3,6 +3,7 @@
  *
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
  *          Yimeng Su <yimeng.su@huawei.com>
+ *          Sebastian Pop <spop@amazon.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -22,11 +23,659 @@
  * For more information, contact us at license @ x265.com.
  *****************************************************************************/
 
+
 #include "common.h"
 #include "primitives.h"
 #include "x265.h"
 #include "cpu.h"
 
+extern "C" {
+#include "fun-decls.h"
+}
+
+// Sets the `prim` member of p.cu[] for the five square sizes 4x4..64x64 to fncdef PFX(fname_WxH_cpu).
+#define ALL_LUMA_TU_TYPED(prim, fncdef, fname, cpu) \
+    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu)
+// Sets the `prim` member of p.cu[] for 4x4, 8x8, 16x16 and 64x64 (32x32 is not touched) to the _neon functions.
+#define LUMA_TU_TYPED_NEON(prim, fncdef, fname) \
+    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## _4x4_ ## neon); \
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## neon); \
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## neon); \
+    p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## _64x64_ ## neon)
+// Sets the `prim` member of p.cu[BLOCK_32x32] to the _sve function.
+#define LUMA_TU_TYPED_CAN_USE_SVE(prim, fncdef, fname) \
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## sve)
+#define ALL_LUMA_TU(prim, fname, cpu)      ALL_LUMA_TU_TYPED(prim, , fname, cpu)  // untyped form: forwards with an empty fncdef
+#define LUMA_TU_NEON(prim, fname)      LUMA_TU_TYPED_NEON(prim, , fname)  // untyped form: forwards with an empty fncdef
+#define LUMA_TU_CAN_USE_SVE(prim, fname)      LUMA_TU_TYPED_CAN_USE_SVE(prim, , fname)  // untyped form: forwards with an empty fncdef
+
+// Sets the `prim` member of p.pu[] for all 25 luma partition sizes listed below to fncdef PFX(fname_WxH_cpu).
+#define ALL_LUMA_PU_TYPED(prim, fncdef, fname, cpu) \
+    p.pu[LUMA_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
+    p.pu[LUMA_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu); \
+    p.pu[LUMA_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
+    p.pu[LUMA_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
+    p.pu[LUMA_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
+    p.pu[LUMA_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
+    p.pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
+    p.pu[LUMA_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
+    p.pu[LUMA_64x32].prim = fncdef PFX(fname ## _64x32_ ## cpu); \
+    p.pu[LUMA_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu); \
+    p.pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## cpu); \
+    p.pu[LUMA_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
+    p.pu[LUMA_16x4].prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
+    p.pu[LUMA_4x16].prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
+    p.pu[LUMA_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
+    p.pu[LUMA_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
+    p.pu[LUMA_32x8].prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
+    p.pu[LUMA_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
+    p.pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## cpu); \
+    p.pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## cpu); \
+    p.pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## cpu); \
+    p.pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## cpu)
+// Sets the `prim` member of p.pu[] for the width-4 sizes (4x4, 4x8, 4x16) to fncdef PFX(fname_WxH_cpu).
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, fncdef, fname, cpu) \
+    p.pu[LUMA_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
+    p.pu[LUMA_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
+    p.pu[LUMA_4x16].prim  = fncdef PFX(fname ## _4x16_ ## cpu)
+// Sets the `prim` member of p.pu[] for every listed size (all but the width-4 ones) to fncdef PFX(fname_WxH_cpu).
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, fncdef, fname, cpu) \
+    p.pu[LUMA_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu); \
+    p.pu[LUMA_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
+    p.pu[LUMA_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
+    p.pu[LUMA_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
+    p.pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
+    p.pu[LUMA_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
+    p.pu[LUMA_64x32].prim = fncdef PFX(fname ## _64x32_ ## cpu); \
+    p.pu[LUMA_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu); \
+    p.pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## cpu); \
+    p.pu[LUMA_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
+    p.pu[LUMA_16x4].prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
+    p.pu[LUMA_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
+    p.pu[LUMA_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
+    p.pu[LUMA_32x8].prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
+    p.pu[LUMA_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
+    p.pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## cpu); \
+    p.pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## cpu); \
+    p.pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## cpu); \
+    p.pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## cpu)
+// Sets the `prim` member of p.pu[] for the sixteen sizes listed below to the _neon functions.
+#define LUMA_PU_TYPED_NEON_1(prim, fncdef, fname) \
+    p.pu[LUMA_4x4].prim   = fncdef PFX(fname ## _4x4_ ## neon); \
+    p.pu[LUMA_4x8].prim   = fncdef PFX(fname ## _4x8_ ## neon); \
+    p.pu[LUMA_4x16].prim  = fncdef PFX(fname ## _4x16_ ## neon); \
+    p.pu[LUMA_12x16].prim = fncdef PFX(fname ## _12x16_ ## neon); \
+    p.pu[LUMA_8x8].prim   = fncdef PFX(fname ## _8x8_ ## neon); \
+    p.pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## neon); \
+    p.pu[LUMA_8x4].prim   = fncdef PFX(fname ## _8x4_ ## neon); \
+    p.pu[LUMA_16x8].prim  = fncdef PFX(fname ## _16x8_ ## neon); \
+    p.pu[LUMA_8x16].prim  = fncdef PFX(fname ## _8x16_ ## neon); \
+    p.pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## neon); \
+    p.pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## neon); \
+    p.pu[LUMA_16x4].prim  = fncdef PFX(fname ## _16x4_ ## neon); \
+    p.pu[LUMA_24x32].prim = fncdef PFX(fname ## _24x32_ ## neon); \
+    p.pu[LUMA_8x32].prim  = fncdef PFX(fname ## _8x32_ ## neon); \
+    p.pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## neon); \
+    p.pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## neon)
+// Sets the `prim` member of p.pu[] for the nine width-32 and width-64 sizes listed below to the _sve functions.
+#define LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname) \
+    p.pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## sve); \
+    p.pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## sve); \
+    p.pu[LUMA_32x16].prim = fncdef PFX(fname ## _32x16_ ## sve); \
+    p.pu[LUMA_64x32].prim = fncdef PFX(fname ## _64x32_ ## sve); \
+    p.pu[LUMA_32x64].prim = fncdef PFX(fname ## _32x64_ ## sve); \
+    p.pu[LUMA_32x24].prim = fncdef PFX(fname ## _32x24_ ## sve); \
+    p.pu[LUMA_32x8].prim  = fncdef PFX(fname ## _32x8_ ## sve); \
+    p.pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## sve); \
+    p.pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## sve)
+// Sets the `prim` member of p.pu[] for the thirteen sizes of width 4, 8 or 16 listed below to the _neon functions.
+#define LUMA_PU_TYPED_NEON_2(prim, fncdef, fname) \
+    p.pu[LUMA_4x4].prim   = fncdef PFX(fname ## _4x4_ ## neon); \
+    p.pu[LUMA_8x4].prim   = fncdef PFX(fname ## _8x4_ ## neon); \
+    p.pu[LUMA_4x8].prim   = fncdef PFX(fname ## _4x8_ ## neon); \
+    p.pu[LUMA_8x8].prim   = fncdef PFX(fname ## _8x8_ ## neon); \
+    p.pu[LUMA_16x8].prim  = fncdef PFX(fname ## _16x8_ ## neon); \
+    p.pu[LUMA_8x16].prim  = fncdef PFX(fname ## _8x16_ ## neon); \
+    p.pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## neon); \
+    p.pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## neon); \
+    p.pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## neon); \
+    p.pu[LUMA_16x4].prim  = fncdef PFX(fname ## _16x4_ ## neon); \
+    p.pu[LUMA_4x16].prim  = fncdef PFX(fname ## _4x16_ ## neon); \
+    p.pu[LUMA_8x32].prim  = fncdef PFX(fname ## _8x32_ ## neon); \
+    p.pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## neon)
+// Sets the `prim` member of p.pu[] for the twelve sizes listed below to fncdef PFX(fname_WxH_cpu).
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, fncdef, fname, cpu) \
+    p.pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu); \
+    p.pu[LUMA_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
+    p.pu[LUMA_64x32].prim = fncdef PFX(fname ## _64x32_ ## cpu); \
+    p.pu[LUMA_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu); \
+    p.pu[LUMA_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
+    p.pu[LUMA_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
+    p.pu[LUMA_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
+    p.pu[LUMA_32x8].prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
+    p.pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## cpu); \
+    p.pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## cpu); \
+    p.pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## cpu)
+// Sets the `prim` member of p.pu[] for the width-4 sizes (4x4, 4x8, 4x16) to the _neon functions.
+#define LUMA_PU_TYPED_NEON_3(prim, fncdef, fname) \
+    p.pu[LUMA_4x4].prim   = fncdef PFX(fname ## _4x4_ ## neon); \
+    p.pu[LUMA_4x8].prim   = fncdef PFX(fname ## _4x8_ ## neon); \
+    p.pu[LUMA_4x16].prim  = fncdef PFX(fname ## _4x16_ ## neon)
+// Sets the `prim` member of p.pu[] for every listed size (all but the width-4 ones) to the _sve2 functions.
+#define LUMA_PU_TYPED_CAN_USE_SVE2(prim, fncdef, fname) \
+    p.pu[LUMA_8x8].prim   = fncdef PFX(fname ## _8x8_ ## sve2); \
+    p.pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## sve2); \
+    p.pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## sve2); \
+    p.pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## sve2); \
+    p.pu[LUMA_8x4].prim   = fncdef PFX(fname ## _8x4_ ## sve2); \
+    p.pu[LUMA_16x8].prim  = fncdef PFX(fname ## _16x8_ ## sve2); \
+    p.pu[LUMA_8x16].prim  = fncdef PFX(fname ## _8x16_ ## sve2); \
+    p.pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## sve2); \
+    p.pu[LUMA_32x16].prim = fncdef PFX(fname ## _32x16_ ## sve2); \
+    p.pu[LUMA_64x32].prim = fncdef PFX(fname ## _64x32_ ## sve2); \
+    p.pu[LUMA_32x64].prim = fncdef PFX(fname ## _32x64_ ## sve2); \
+    p.pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## sve2); \
+    p.pu[LUMA_12x16].prim = fncdef PFX(fname ## _12x16_ ## sve2); \
+    p.pu[LUMA_16x4].prim  = fncdef PFX(fname ## _16x4_ ## sve2); \
+    p.pu[LUMA_32x24].prim = fncdef PFX(fname ## _32x24_ ## sve2); \
+    p.pu[LUMA_24x32].prim = fncdef PFX(fname ## _24x32_ ## sve2); \
+    p.pu[LUMA_32x8].prim  = fncdef PFX(fname ## _32x8_ ## sve2); \
+    p.pu[LUMA_8x32].prim  = fncdef PFX(fname ## _8x32_ ## sve2); \
+    p.pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## sve2); \
+    p.pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## sve2); \
+    p.pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## sve2); \
+    p.pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## sve2)
+// Sets the `prim` member of p.pu[] for the fifteen sizes listed below to the filterPixelToShort _neon functions.
+#define LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
+    p.pu[LUMA_4x4].prim   = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
+    p.pu[LUMA_8x8].prim   = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
+    p.pu[LUMA_16x16].prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
+    p.pu[LUMA_8x4].prim   = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
+    p.pu[LUMA_4x8].prim   = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
+    p.pu[LUMA_16x8].prim  = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
+    p.pu[LUMA_8x16].prim  = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
+    p.pu[LUMA_16x32].prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
+    p.pu[LUMA_16x12].prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
+    p.pu[LUMA_12x16].prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
+    p.pu[LUMA_16x4].prim  = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
+    p.pu[LUMA_4x16].prim  = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
+    p.pu[LUMA_24x32].prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
+    p.pu[LUMA_8x32].prim  = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
+    p.pu[LUMA_16x64].prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon)
+// Sets the `prim` member of p.pu[] for the ten sizes listed below to the filterPixelToShort _sve functions.
+#define LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
+    p.pu[LUMA_32x32].prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
+    p.pu[LUMA_32x16].prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \
+    p.pu[LUMA_32x64].prim = fncdef PFX(filterPixelToShort ## _32x64_ ## sve); \
+    p.pu[LUMA_32x24].prim = fncdef PFX(filterPixelToShort ## _32x24_ ## sve); \
+    p.pu[LUMA_32x8].prim  = fncdef PFX(filterPixelToShort ## _32x8_ ## sve); \
+    p.pu[LUMA_64x64].prim = fncdef PFX(filterPixelToShort ## _64x64_ ## sve); \
+    p.pu[LUMA_64x32].prim = fncdef PFX(filterPixelToShort ## _64x32_ ## sve); \
+    p.pu[LUMA_64x48].prim = fncdef PFX(filterPixelToShort ## _64x48_ ## sve); \
+    p.pu[LUMA_64x16].prim = fncdef PFX(filterPixelToShort ## _64x16_ ## sve); \
+    p.pu[LUMA_48x64].prim = fncdef PFX(filterPixelToShort ## _48x64_ ## sve)
​

x265_3.6.tar.gz/source/common/aarch64/asm-sve.S Added

@@ -0,0 +1,39 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.arch armv8-a+sve
+
+// Predicated in-place absolute value of two SVE vectors: \a = |\a| and
+// \b = |\b| in the lanes selected by predicate \c. The merging form (/m) is
+// used, so inactive lanes keep their previous contents.
+.macro ABS2_SVE a b c
+    abs             \a, \c\()/m, \a
+    abs             \b, \c\()/m, \b
+.endm
+
+// Predicated in-place absolute value of eight SVE vectors (\z0..\z7), done as
+// four ABS2_SVE pairs governed by the predicate passed as the last argument.
+// The predicate argument is now forwarded as \p0; previously the literal
+// register name p0 was hard-coded, so any other predicate passed by a caller
+// was silently ignored. Callers that pass p0 are unaffected.
+.macro ABS8_SVE z0, z1, z2, z3, z4, z5, z6, z7, p0
+    ABS2_SVE        \z0, \z1, \p0
+    ABS2_SVE        \z2, \z3, \p0
+    ABS2_SVE        \z4, \z5, \p0
+    ABS2_SVE        \z6, \z7, \p0
+.endm
+

 
@@ -0,0 +1,39 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.arch armv8-a+sve
+
+// Predicated in-place absolute value of two SVE vectors: \a = |\a| and
+// \b = |\b| in the lanes selected by predicate \c. The merging form (/m) is
+// used, so inactive lanes keep their previous contents.
+.macro ABS2_SVE a b c
+    abs             \a, \c\()/m, \a
+    abs             \b, \c\()/m, \b
+.endm
+
+// Predicated in-place absolute value of eight SVE vectors (\z0..\z7), done as
+// four ABS2_SVE pairs governed by the predicate passed as the last argument.
+// The predicate argument is now forwarded as \p0; previously the literal
+// register name p0 was hard-coded, so any other predicate passed by a caller
+// was silently ignored. Callers that pass p0 are unaffected.
+.macro ABS8_SVE z0, z1, z2, z3, z4, z5, z6, z7, p0
+    ABS2_SVE        \z0, \z1, \p0
+    ABS2_SVE        \z2, \z3, \p0
+    ABS2_SVE        \z4, \z5, \p0
+    ABS2_SVE        \z6, \z7, \p0
+.endm
+
​

x265_3.5.tar.gz/source/common/aarch64/asm.S -> x265_3.6.tar.gz/source/common/aarch64/asm.S Changed

@@ -1,7 +1,8 @@
 /*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
  *
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *          Sebastian Pop <spop@amazon.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -21,34 +22,74 @@
  * For more information, contact us at license @ x265.com.
  *****************************************************************************/
 
+#ifndef ASM_S_  // #include guards
+#define ASM_S_
+
 .arch           armv8-a
 
+#define PFX3(prefix, name) prefix ## _ ## name
+#define PFX2(prefix, name) PFX3(prefix, name)
+#define PFX(name)          PFX2(X265_NS, name)
+
+#ifdef __APPLE__
+#define PREFIX 1
+#endif
+
 #ifdef PREFIX
 #define EXTERN_ASM _
+#define HAVE_AS_FUNC 0
+#elif defined __clang__
+#define EXTERN_ASM
+#define HAVE_AS_FUNC 0
+#define PREFIX 1
 #else
 #define EXTERN_ASM
+#define HAVE_AS_FUNC 1
 #endif
 
 #ifdef __ELF__
 #define ELF
 #else
+#ifdef PREFIX
+#define ELF #
+#else
 #define ELF @
 #endif
-
-#define HAVE_AS_FUNC 1
+#endif
 
 #if HAVE_AS_FUNC
 #define FUNC
 #else
+#ifdef PREFIX
+#define FUNC #
+#else
 #define FUNC @
 #endif
+#endif
+
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b)
+
+#define PFX_C(name)        JOIN(JOIN(JOIN(EXTERN_ASM, X265_NS), _), name)
+
+#ifdef __APPLE__
+.macro endfunc
+ELF .size \name, . - \name
+FUNC .endfunc
+.endm
+#endif
 
 .macro function name, export=1
+#ifdef __APPLE__
+    .global \name
+    endfunc
+#else
     .macro endfunc
 ELF     .size   \name, . - \name
 FUNC    .endfunc
         .purgem endfunc
     .endm
+#endif
         .align  2
 .if \export == 1
         .global EXTERN_ASM\name
@@ -64,6 +105,83 @@
 .endif
 .endm
 
+// Open a read-only data object called \name.
+// Switches to the constant-data section (.const_data on Mach-O, .rodata
+// otherwise), aligns with `.align \align` (default 2) and emits the label.
+// It also defines a one-shot `endconst` macro that records the object size
+// (only where the ELF prefix expands to nothing) and then purges itself, so
+// every `const` must be closed by exactly one `endconst`.
+.macro  const   name, align=2
+    .macro endconst
+ELF     .size   \name, . - \name
+        .purgem endconst
+    .endm
+#ifdef __MACH__
+    .const_data
+#else
+    .section .rodata
+#endif
+    .align          \align
+\name:
+.endm
+
+// Load the address of symbol \val (plus optional constant \offset) into \rd
+// using an adrp/add pair.
+//   - Apple:          @PAGE / @PAGEOFF relocation syntax.
+//   - PIC on Windows: :lo12: syntax.
+//   - everything else: :lo12: syntax with the offset folded into the symbol
+//     expression regardless of its sign.
+// On the first two targets a negative offset is not folded into the
+// relocation; the symbol address is formed first and the magnitude is then
+// subtracted with a separate `sub`.
+.macro  movrel rd, val, offset=0
+#if defined(__APPLE__)
+  .if \offset < 0
+        adrp            \rd, \val@PAGE
+        add             \rd, \rd, \val@PAGEOFF
+        sub             \rd, \rd, -(\offset)
+  .else
+        adrp            \rd, \val+(\offset)@PAGE
+        add             \rd, \rd, \val+(\offset)@PAGEOFF
+  .endif
+#elif defined(PIC) && defined(_WIN32)
+  .if \offset < 0
+        adrp            \rd, \val
+        add             \rd, \rd, :lo12:\val
+        sub             \rd, \rd, -(\offset)
+  .else
+        adrp            \rd, \val+(\offset)
+        add             \rd, \rd, :lo12:\val+(\offset)
+  .endif
+#else
+        adrp            \rd, \val+(\offset)
+        add             \rd, \rd, :lo12:\val+(\offset)
+#endif
+.endm
 
 #define FENC_STRIDE 64
 #define FDEC_STRIDE 32
+
+// Butterfly: \sum = \a + \b, \diff = \a - \b.
+// \sum is written first, so it must not alias \a or \b.
+.macro SUMSUB_AB sum, diff, a, b
+    add             \sum,  \a, \b
+    sub             \diff, \a, \b
+.endm
+
+// Two independent butterflies: (\s1, \d1) from (\a, \b) and (\s2, \d2)
+// from (\c, \d).
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
+    SUMSUB_AB       \s1, \d1, \a, \b
+    SUMSUB_AB       \s2, \d2, \c, \d
+.endm
+
+// Two butterfly stages over \r1..\r4 using \t1..\t4 as scratch:
+//   stage 1: t1 = r1 + r2, t2 = r1 - r2, t3 = r3 + r4, t4 = r3 - r4
+//   stage 2: r1 = t1 + t3, r3 = t1 - t3, r2 = t2 + t4, r4 = t2 - t4
+// Results are left in \r1..\r4; \t1..\t4 are clobbered.
+.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
+    SUMSUB_ABCD     \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
+    SUMSUB_ABCD     \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
+.endm
+
+// In-place absolute value of two operands: \a = |\a|, \b = |\b|.
+.macro ABS2 a b
+    abs             \a, \a
+    abs             \b, \b
+.endm
+
+// In-place absolute value of eight operands, done as four ABS2 pairs.
+.macro ABS8 v0, v1, v2, v3, v4, v5, v6, v7
+    ABS2            \v0, \v1
+    ABS2            \v2, \v3
+    ABS2            \v4, \v5
+    ABS2            \v6, \v7
+.endm
+
+// Emit the trn1/trn2 pair for sources \s1 and \s2: trn1 goes to \t1 and
+// trn2 to \t2. \t1 is written first, so it must not alias \s1 or \s2.
+.macro vtrn t1, t2, s1, s2
+    trn1            \t1, \s1, \s2
+    trn2            \t2, \s1, \s2
+.endm
+
+// Two independent vtrn pairs: (\t1, \t2) from (\s1, \s2) and (\t3, \t4)
+// from (\s3, \s4).
+.macro trn4 t1, t2, t3, t4, s1, s2, s3, s4
+    vtrn            \t1, \t2, \s1, \s2
+    vtrn            \t3, \t4, \s3, \s4
+.endm
+
+#endif
\ No newline at end of file

 
@@ -1,7 +1,8 @@
 /*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
  *
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *          Sebastian Pop <spop@amazon.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -21,34 +22,74 @@
  * For more information, contact us at license @ x265.com.
  *****************************************************************************/
 
+#ifndef ASM_S_  // #include guards
+#define ASM_S_
+
 .arch           armv8-a
 
+#define PFX3(prefix, name) prefix ## _ ## name
+#define PFX2(prefix, name) PFX3(prefix, name)
+#define PFX(name)          PFX2(X265_NS, name)
+
+#ifdef __APPLE__
+#define PREFIX 1
+#endif
+
 #ifdef PREFIX
 #define EXTERN_ASM _
+#define HAVE_AS_FUNC 0
+#elif defined __clang__
+#define EXTERN_ASM
+#define HAVE_AS_FUNC 0
+#define PREFIX 1
 #else
 #define EXTERN_ASM
+#define HAVE_AS_FUNC 1
 #endif
 
 #ifdef __ELF__
 #define ELF
 #else
+#ifdef PREFIX
+#define ELF #
+#else
 #define ELF @
 #endif
-
-#define HAVE_AS_FUNC 1
+#endif
 
 #if HAVE_AS_FUNC
 #define FUNC
 #else
+#ifdef PREFIX
+#define FUNC #
+#else
 #define FUNC @
 #endif
+#endif
+
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b)
+
+#define PFX_C(name)        JOIN(JOIN(JOIN(EXTERN_ASM, X265_NS), _), name)
+
+#ifdef __APPLE__
+.macro endfunc
+ELF .size \name, . - \name
+FUNC .endfunc
+.endm
+#endif
 
 .macro function name, export=1
+#ifdef __APPLE__
+    .global \name
+    endfunc
+#else
     .macro endfunc
 ELF     .size   \name, . - \name
 FUNC    .endfunc
         .purgem endfunc
     .endm
+#endif
         .align  2
 .if \export == 1
         .global EXTERN_ASM\name
@@ -64,6 +105,83 @@
 .endif
 .endm
 
+// Open a read-only data object called \name.
+// Switches to the constant-data section (.const_data on Mach-O, .rodata
+// otherwise), aligns with `.align \align` (default 2) and emits the label.
+// It also defines a one-shot `endconst` macro that records the object size
+// (only where the ELF prefix expands to nothing) and then purges itself, so
+// every `const` must be closed by exactly one `endconst`.
+.macro  const   name, align=2
+    .macro endconst
+ELF     .size   \name, . - \name
+        .purgem endconst
+    .endm
+#ifdef __MACH__
+    .const_data
+#else
+    .section .rodata
+#endif
+    .align          \align
+\name:
+.endm
+
+// Load the address of symbol \val (plus optional constant \offset) into \rd
+// using an adrp/add pair.
+//   - Apple:          @PAGE / @PAGEOFF relocation syntax.
+//   - PIC on Windows: :lo12: syntax.
+//   - everything else: :lo12: syntax with the offset folded into the symbol
+//     expression regardless of its sign.
+// On the first two targets a negative offset is not folded into the
+// relocation; the symbol address is formed first and the magnitude is then
+// subtracted with a separate `sub`.
+.macro  movrel rd, val, offset=0
+#if defined(__APPLE__)
+  .if \offset < 0
+        adrp            \rd, \val@PAGE
+        add             \rd, \rd, \val@PAGEOFF
+        sub             \rd, \rd, -(\offset)
+  .else
+        adrp            \rd, \val+(\offset)@PAGE
+        add             \rd, \rd, \val+(\offset)@PAGEOFF
+  .endif
+#elif defined(PIC) && defined(_WIN32)
+  .if \offset < 0
+        adrp            \rd, \val
+        add             \rd, \rd, :lo12:\val
+        sub             \rd, \rd, -(\offset)
+  .else
+        adrp            \rd, \val+(\offset)
+        add             \rd, \rd, :lo12:\val+(\offset)
+  .endif
+#else
+        adrp            \rd, \val+(\offset)
+        add             \rd, \rd, :lo12:\val+(\offset)
+#endif
+.endm
 
 #define FENC_STRIDE 64
 #define FDEC_STRIDE 32
+
+// Butterfly: \sum = \a + \b, \diff = \a - \b.
+// \sum is written first, so it must not alias \a or \b.
+.macro SUMSUB_AB sum, diff, a, b
+    add             \sum,  \a, \b
+    sub             \diff, \a, \b
+.endm
+
+// Two independent butterflies: (\s1, \d1) from (\a, \b) and (\s2, \d2)
+// from (\c, \d).
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
+    SUMSUB_AB       \s1, \d1, \a, \b
+    SUMSUB_AB       \s2, \d2, \c, \d
+.endm
+
+// Two butterfly stages over \r1..\r4 using \t1..\t4 as scratch:
+//   stage 1: t1 = r1 + r2, t2 = r1 - r2, t3 = r3 + r4, t4 = r3 - r4
+//   stage 2: r1 = t1 + t3, r3 = t1 - t3, r2 = t2 + t4, r4 = t2 - t4
+// Results are left in \r1..\r4; \t1..\t4 are clobbered.
+.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
+    SUMSUB_ABCD     \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
+    SUMSUB_ABCD     \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
+.endm
+
+// In-place absolute value of two operands: \a = |\a|, \b = |\b|.
+.macro ABS2 a b
+    abs             \a, \a
+    abs             \b, \b
+.endm
+
+// In-place absolute value of eight operands, done as four ABS2 pairs.
+.macro ABS8 v0, v1, v2, v3, v4, v5, v6, v7
+    ABS2            \v0, \v1
+    ABS2            \v2, \v3
+    ABS2            \v4, \v5
+    ABS2            \v6, \v7
+.endm
+
+// Emit the trn1/trn2 pair for sources \s1 and \s2: trn1 goes to \t1 and
+// trn2 to \t2. \t1 is written first, so it must not alias \s1 or \s2.
+.macro vtrn t1, t2, s1, s2
+    trn1            \t1, \s1, \s2
+    trn2            \t2, \s1, \s2
+.endm
+
+// Two independent vtrn pairs: (\t1, \t2) from (\s1, \s2) and (\t3, \t4)
+// from (\s3, \s4).
+.macro trn4 t1, t2, t3, t4, s1, s2, s3, s4
+    vtrn            \t1, \t2, \s1, \s2
+    vtrn            \t3, \t4, \s3, \s4
+.endm
+
+#endif
\ No newline at end of file
​

x265_3.6.tar.gz/source/common/aarch64/blockcopy8-common.S Added

@@ -0,0 +1,54 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// This file contains the macros written using NEON instruction set
+// that are also used by the SVE2 functions
+
+#include "asm.S"
+
+.arch           armv8-a
+
+// void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
+// Shared prologue; on exit:
+//   x2      = 2 * x2 (presumably the int16 stride converted to bytes)
+//   v0.8h   = -shift in every lane
+//   v1.8h   = -(1 << (shift - 1)) in every lane for shift >= 1
+// NOTE(review): v1 looks like a negated rounding term and v0 a right-shift
+// count for a later sshl; the consumers are outside this file, so confirm.
+.macro cpy1Dto2D_shr_start
+    add             x2, x2, x2
+    dup             v0.8h, w3
+    cmeq            v1.8h, v1.8h, v1.8h     // all lanes = -1
+    sshl            v1.8h, v1.8h, v0.8h     // -1 << shift
+    sri             v1.8h, v1.8h, #1        // >> 1 while keeping each lane's top bit
+    neg             v0.8h, v0.8h
+.endm
+
+// Same instruction sequence as cpy1Dto2D_shr_start, kept under its own name
+// for the cpy2Dto1D_shr routines: x2 is doubled, v0.8h = -shift and
+// v1.8h = -(1 << (shift - 1)) for shift >= 1.
+.macro cpy2Dto1D_shr_start
+    add             x2, x2, x2
+    dup             v0.8h, w3
+    cmeq            v1.8h, v1.8h, v1.8h
+    sshl            v1.8h, v1.8h, v0.8h
+    sri             v1.8h, v1.8h, #1
+    neg             v0.8h, v0.8h
+.endm
+
+// TBL index vector holding the even byte indices 0, 2, ..., 30. Used with a
+// two-register table it picks every second byte of a 32-byte source, which
+// narrows sixteen halfwords to sixteen bytes in a single tbl (the low byte
+// of each halfword, assuming little-endian data).
+const xtn_xtn2_table, align=4
+.byte    0, 2, 4, 6, 8, 10, 12, 14
+.byte    16, 18, 20, 22, 24, 26, 28, 30
+endconst
+

 
@@ -0,0 +1,54 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// This file contains the macros written using NEON instruction set
+// that are also used by the SVE2 functions
+
+#include "asm.S"
+
+.arch           armv8-a
+
+// void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
+// Shared prologue; on exit:
+//   x2      = 2 * x2 (presumably the int16 stride converted to bytes)
+//   v0.8h   = -shift in every lane
+//   v1.8h   = -(1 << (shift - 1)) in every lane for shift >= 1
+// NOTE(review): v1 looks like a negated rounding term and v0 a right-shift
+// count for a later sshl; the consumers are outside this file, so confirm.
+.macro cpy1Dto2D_shr_start
+    add             x2, x2, x2
+    dup             v0.8h, w3
+    cmeq            v1.8h, v1.8h, v1.8h     // all lanes = -1
+    sshl            v1.8h, v1.8h, v0.8h     // -1 << shift
+    sri             v1.8h, v1.8h, #1        // >> 1 while keeping each lane's top bit
+    neg             v0.8h, v0.8h
+.endm
+
+// Same instruction sequence as cpy1Dto2D_shr_start, kept under its own name
+// for the cpy2Dto1D_shr routines: x2 is doubled, v0.8h = -shift and
+// v1.8h = -(1 << (shift - 1)) for shift >= 1.
+.macro cpy2Dto1D_shr_start
+    add             x2, x2, x2
+    dup             v0.8h, w3
+    cmeq            v1.8h, v1.8h, v1.8h
+    sshl            v1.8h, v1.8h, v0.8h
+    sri             v1.8h, v1.8h, #1
+    neg             v0.8h, v0.8h
+.endm
+
+// TBL index vector holding the even byte indices 0, 2, ..., 30. Used with a
+// two-register table it picks every second byte of a 32-byte source, which
+// narrows sixteen halfwords to sixteen bytes in a single tbl (the low byte
+// of each halfword, assuming little-endian data).
+const xtn_xtn2_table, align=4
+.byte    0, 2, 4, 6, 8, 10, 12, 14
+.byte    16, 18, 20, 22, 24, 26, 28, 30
+endconst
+
​

x265_3.6.tar.gz/source/common/aarch64/blockcopy8-sve.S Added

@@ -0,0 +1,1416 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+#include "blockcopy8-common.S"
+
+.arch armv8-a+sve
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
+ *
+ * r0   - a
+ * r1   - stridea
+ * r2   - b
+ * r3   - strideb */
+
+// 4x4 int16 -> pixel copy (x0 = a, x1 = stridea, x2 = b, x3 = strideb).
+// Per row: load 4 halfwords from b and store one byte per lane to a.
+// Memory operands are written with their [..] brackets, which had been lost.
+function PFX(blockcopy_sp_4x4_sve)
+    ptrue           p0.h, vl4               // first 4 halfword lanes active
+.rept 2
+    ld1h            {z0.h}, p0/z, [x2]
+    add             x2, x2, x3, lsl #1      // source stride is in int16 elements
+    st1b            {z0.h}, p0, [x0]        // narrowing store: one byte per lane
+    add             x0, x0, x1
+    ld1h            {z1.h}, p0/z, [x2]
+    add             x2, x2, x3, lsl #1
+    st1b            {z1.h}, p0, [x0]
+    add             x0, x0, x1
+.endr
+    ret
+endfunc
+
+// 8x8 int16 -> pixel copy (x0 = a, x1 = stridea, x2 = b, x3 = strideb).
+// Per row: load 8 halfwords from b and store one byte per lane to a.
+// Memory operands are written with their [..] brackets, which had been lost.
+function PFX(blockcopy_sp_8x8_sve)
+    ptrue           p0.h, vl8               // first 8 halfword lanes active
+.rept 4
+    ld1h            {z0.h}, p0/z, [x2]
+    add             x2, x2, x3, lsl #1      // source stride is in int16 elements
+    st1b            {z0.h}, p0, [x0]        // narrowing store: one byte per lane
+    add             x0, x0, x1
+    ld1h            {z1.h}, p0/z, [x2]
+    add             x2, x2, x3, lsl #1
+    st1b            {z1.h}, p0, [x0]
+    add             x0, x0, x1
+.endr
+    ret
+endfunc
+
+// 16x16 int16 -> pixel copy (x0 = a, x1 = stridea, x2 = b, x3 = strideb).
+// With a 16-byte vector length the NEON sequence is used (tbl with
+// xtn_xtn2_table narrows 16 halfwords to 16 bytes); with longer vectors one
+// predicated SVE load/store handles a whole row.
+// Memory operands are written with their [..] brackets, which had been lost.
+function PFX(blockcopy_sp_16x16_sve)
+    rdvl            x9, #1                  // x9 = vector length in bytes
+    cmp             x9, #16
+    bgt             .vl_gt_16_blockcopy_sp_16_16
+    lsl             x3, x3, #1              // source stride: elements -> bytes
+    movrel          x11, xtn_xtn2_table
+    ld1             {v31.16b}, [x11]
+.rept 8
+    ld1             {v0.8h-v1.8h}, [x2], x3
+    ld1             {v2.8h-v3.8h}, [x2], x3
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
+    st1             {v0.16b}, [x0], x1
+    st1             {v1.16b}, [x0], x1
+.endr
+    ret
+.vl_gt_16_blockcopy_sp_16_16:
+    ptrue           p0.h, vl16              // 16 halfword lanes = one row
+.rept 8
+    ld1h            {z0.h}, p0/z, [x2]
+    st1b            {z0.h}, p0, [x0]        // narrowing store: one byte per lane
+    add             x2, x2, x3, lsl #1
+    add             x0, x0, x1
+    ld1h            {z1.h}, p0/z, [x2]
+    st1b            {z1.h}, p0, [x0]
+    add             x2, x2, x3, lsl #1
+    add             x0, x0, x1
+.endr
+    ret
+endfunc
+
+// 32x32 int16 -> pixel copy (x0 = a, x1 = stridea, x2 = b, x3 = strideb).
+// Three paths chosen by the vector length in bytes (x9):
+//   VL <= 16 : NEON sequence, tbl with xtn_xtn2_table narrows the halfwords
+//   VL <= 48 : two 16-lane SVE halves per row
+//   VL  > 48 : one 32-lane SVE access per row
+// w12 counts 4 outer iterations of 8 rows each.
+// Memory operands are written with their [..] brackets, which had been lost.
+function PFX(blockcopy_sp_32x32_sve)
+    mov             w12, #4
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_blockcopy_sp_32_32
+    lsl             x3, x3, #1              // source stride: elements -> bytes
+    movrel          x11, xtn_xtn2_table
+    ld1             {v31.16b}, [x11]
+.loop_csp32_sve:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v0.8h-v3.8h}, [x2], x3
+    ld1             {v4.8h-v7.8h}, [x2], x3
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
+    tbl             v2.16b, {v4.16b,v5.16b}, v31.16b
+    tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
+    st1             {v0.16b-v1.16b}, [x0], x1
+    st1             {v2.16b-v3.16b}, [x0], x1
+.endr
+    cbnz            w12, .loop_csp32_sve
+    ret
+.vl_gt_16_blockcopy_sp_32_32:
+    cmp             x9, #48
+    bgt             .vl_gt_48_blockcopy_sp_32_32
+    // NOTE(review): the "#1, mul vl" offsets below step by one full vector
+    // (loads) / half a vector (byte stores), which equals the intended
+    // 32 / 16 bytes only when VL is 32 bytes. A 48-byte VL also reaches this
+    // path and would address the second half of each row incorrectly.
+    ptrue           p0.h, vl16
+.vl_gt_16_loop_csp32_sve:
+    sub             w12, w12, #1
+.rept 4
+    ld1h            {z0.h}, p0/z, [x2]
+    ld1h            {z1.h}, p0/z, [x2, #1, mul vl]
+    st1b            {z0.h}, p0, [x0]
+    st1b            {z1.h}, p0, [x0, #1, mul vl]
+    add             x2, x2, x3, lsl #1
+    add             x0, x0, x1
+    ld1h            {z2.h}, p0/z, [x2]
+    ld1h            {z3.h}, p0/z, [x2, #1, mul vl]
+    st1b            {z2.h}, p0, [x0]
+    st1b            {z3.h}, p0, [x0, #1, mul vl]
+    add             x2, x2, x3, lsl #1
+    add             x0, x0, x1
+.endr
+    cbnz            w12, .vl_gt_16_loop_csp32_sve
+    ret
+.vl_gt_48_blockcopy_sp_32_32:
+    ptrue           p0.h, vl32              // 32 halfword lanes = one row
+.vl_gt_48_loop_csp32_sve:
+    sub             w12, w12, #1
+.rept 4
+    ld1h            {z0.h}, p0/z, [x2]
+    st1b            {z0.h}, p0, [x0]
+    add             x2, x2, x3, lsl #1
+    add             x0, x0, x1
+    ld1h            {z1.h}, p0/z, [x2]
+    st1b            {z1.h}, p0, [x0]
+    add             x2, x2, x3, lsl #1
+    add             x0, x0, x1
+.endr
+    cbnz            w12, .vl_gt_48_loop_csp32_sve
+    ret
+endfunc
+
+// 16x16 pixel -> int16 copy: x0 = destination (int16), x1 = its stride in
+// elements, x2 = source (bytes), x3 = source stride.
+// With a 16-byte vector length the NEON sequence widens with uxtl/uxtl2;
+// with longer vectors a zero-extending byte load fills 16 halfword lanes.
+// Memory operands are written with their [..] brackets, which had been lost.
+function PFX(blockcopy_ps_16x16_sve)
+    rdvl            x9, #1                  // x9 = vector length in bytes
+    cmp             x9, #16
+    bgt             .vl_gt_16_blockcopy_ps_16_16
+    lsl             x1, x1, #1              // destination stride: elements -> bytes
+.rept 8
+    ld1             {v4.16b}, [x2], x3
+    ld1             {v5.16b}, [x2], x3
+    uxtl            v0.8h, v4.8b
+    uxtl2           v1.8h, v4.16b
+    uxtl            v2.8h, v5.8b
+    uxtl2           v3.8h, v5.16b
+    st1             {v0.8h-v1.8h}, [x0], x1
+    st1             {v2.8h-v3.8h}, [x0], x1
+.endr
+    ret
+.vl_gt_16_blockcopy_ps_16_16:
+    ptrue           p0.b, vl32              // 32 byte lanes = 16 halfword lanes
+.rept 16
+    ld1b            {z1.h}, p0/z, [x2]      // byte load widened to halfword lanes
+    st1h            {z1.h}, p0, [x0]
+    add             x0, x0, x1, lsl #1
+    add             x2, x2, x3
+.endr
+    ret
+endfunc
+
+function PFX(blockcopy_ps_32x32_sve)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_blockcopy_ps_32_32

 
@@ -0,0 +1,1416 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+#include "blockcopy8-common.S"
+
+.arch armv8-a+sve
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
+ *
+ * r0   - a
+ * r1   - stridea
+ * r2   - b
+ * r3   - strideb */
+
+// 4x4 int16 -> pixel copy (x0 = a, x1 = stridea, x2 = b, x3 = strideb).
+// Per row: load 4 halfwords from b and store one byte per lane to a.
+// Memory operands are written with their [..] brackets, which had been lost.
+function PFX(blockcopy_sp_4x4_sve)
+    ptrue           p0.h, vl4               // first 4 halfword lanes active
+.rept 2
+    ld1h            {z0.h}, p0/z, [x2]
+    add             x2, x2, x3, lsl #1      // source stride is in int16 elements
+    st1b            {z0.h}, p0, [x0]        // narrowing store: one byte per lane
+    add             x0, x0, x1
+    ld1h            {z1.h}, p0/z, [x2]
+    add             x2, x2, x3, lsl #1
+    st1b            {z1.h}, p0, [x0]
+    add             x0, x0, x1
+.endr
+    ret
+endfunc
+
+// 8x8 int16 -> pixel copy (x0 = a, x1 = stridea, x2 = b, x3 = strideb).
+// Per row: load 8 halfwords from b and store one byte per lane to a.
+// Memory operands are written with their [..] brackets, which had been lost.
+function PFX(blockcopy_sp_8x8_sve)
+    ptrue           p0.h, vl8               // first 8 halfword lanes active
+.rept 4
+    ld1h            {z0.h}, p0/z, [x2]
+    add             x2, x2, x3, lsl #1      // source stride is in int16 elements
+    st1b            {z0.h}, p0, [x0]        // narrowing store: one byte per lane
+    add             x0, x0, x1
+    ld1h            {z1.h}, p0/z, [x2]
+    add             x2, x2, x3, lsl #1
+    st1b            {z1.h}, p0, [x0]
+    add             x0, x0, x1
+.endr
+    ret
+endfunc
+
+// 16x16 int16 -> pixel copy (x0 = a, x1 = stridea, x2 = b, x3 = strideb).
+// With a 16-byte vector length the NEON sequence is used (tbl with
+// xtn_xtn2_table narrows 16 halfwords to 16 bytes); with longer vectors one
+// predicated SVE load/store handles a whole row.
+// Memory operands are written with their [..] brackets, which had been lost.
+function PFX(blockcopy_sp_16x16_sve)
+    rdvl            x9, #1                  // x9 = vector length in bytes
+    cmp             x9, #16
+    bgt             .vl_gt_16_blockcopy_sp_16_16
+    lsl             x3, x3, #1              // source stride: elements -> bytes
+    movrel          x11, xtn_xtn2_table
+    ld1             {v31.16b}, [x11]
+.rept 8
+    ld1             {v0.8h-v1.8h}, [x2], x3
+    ld1             {v2.8h-v3.8h}, [x2], x3
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
+    st1             {v0.16b}, [x0], x1
+    st1             {v1.16b}, [x0], x1
+.endr
+    ret
+.vl_gt_16_blockcopy_sp_16_16:
+    ptrue           p0.h, vl16              // 16 halfword lanes = one row
+.rept 8
+    ld1h            {z0.h}, p0/z, [x2]
+    st1b            {z0.h}, p0, [x0]        // narrowing store: one byte per lane
+    add             x2, x2, x3, lsl #1
+    add             x0, x0, x1
+    ld1h            {z1.h}, p0/z, [x2]
+    st1b            {z1.h}, p0, [x0]
+    add             x2, x2, x3, lsl #1
+    add             x0, x0, x1
+.endr
+    ret
+endfunc
+
+// 32x32 int16 -> pixel copy (x0 = a, x1 = stridea, x2 = b, x3 = strideb).
+// Three paths chosen by the vector length in bytes (x9):
+//   VL <= 16 : NEON sequence, tbl with xtn_xtn2_table narrows the halfwords
+//   VL <= 48 : two 16-lane SVE halves per row
+//   VL  > 48 : one 32-lane SVE access per row
+// w12 counts 4 outer iterations of 8 rows each.
+// Memory operands are written with their [..] brackets, which had been lost.
+function PFX(blockcopy_sp_32x32_sve)
+    mov             w12, #4
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_blockcopy_sp_32_32
+    lsl             x3, x3, #1              // source stride: elements -> bytes
+    movrel          x11, xtn_xtn2_table
+    ld1             {v31.16b}, [x11]
+.loop_csp32_sve:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v0.8h-v3.8h}, [x2], x3
+    ld1             {v4.8h-v7.8h}, [x2], x3
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
+    tbl             v2.16b, {v4.16b,v5.16b}, v31.16b
+    tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
+    st1             {v0.16b-v1.16b}, [x0], x1
+    st1             {v2.16b-v3.16b}, [x0], x1
+.endr
+    cbnz            w12, .loop_csp32_sve
+    ret
+.vl_gt_16_blockcopy_sp_32_32:
+    cmp             x9, #48
+    bgt             .vl_gt_48_blockcopy_sp_32_32
+    // NOTE(review): the "#1, mul vl" offsets below step by one full vector
+    // (loads) / half a vector (byte stores), which equals the intended
+    // 32 / 16 bytes only when VL is 32 bytes. A 48-byte VL also reaches this
+    // path and would address the second half of each row incorrectly.
+    ptrue           p0.h, vl16
+.vl_gt_16_loop_csp32_sve:
+    sub             w12, w12, #1
+.rept 4
+    ld1h            {z0.h}, p0/z, [x2]
+    ld1h            {z1.h}, p0/z, [x2, #1, mul vl]
+    st1b            {z0.h}, p0, [x0]
+    st1b            {z1.h}, p0, [x0, #1, mul vl]
+    add             x2, x2, x3, lsl #1
+    add             x0, x0, x1
+    ld1h            {z2.h}, p0/z, [x2]
+    ld1h            {z3.h}, p0/z, [x2, #1, mul vl]
+    st1b            {z2.h}, p0, [x0]
+    st1b            {z3.h}, p0, [x0, #1, mul vl]
+    add             x2, x2, x3, lsl #1
+    add             x0, x0, x1
+.endr
+    cbnz            w12, .vl_gt_16_loop_csp32_sve
+    ret
+.vl_gt_48_blockcopy_sp_32_32:
+    ptrue           p0.h, vl32              // 32 halfword lanes = one row
+.vl_gt_48_loop_csp32_sve:
+    sub             w12, w12, #1
+.rept 4
+    ld1h            {z0.h}, p0/z, [x2]
+    st1b            {z0.h}, p0, [x0]
+    add             x2, x2, x3, lsl #1
+    add             x0, x0, x1
+    ld1h            {z1.h}, p0/z, [x2]
+    st1b            {z1.h}, p0, [x0]
+    add             x2, x2, x3, lsl #1
+    add             x0, x0, x1
+.endr
+    cbnz            w12, .vl_gt_48_loop_csp32_sve
+    ret
+endfunc
+
+// 16x16 pixel -> int16 copy: x0 = destination (int16), x1 = its stride in
+// elements, x2 = source (bytes), x3 = source stride.
+// With a 16-byte vector length the NEON sequence widens with uxtl/uxtl2;
+// with longer vectors a zero-extending byte load fills 16 halfword lanes.
+// Memory operands are written with their [..] brackets, which had been lost.
+function PFX(blockcopy_ps_16x16_sve)
+    rdvl            x9, #1                  // x9 = vector length in bytes
+    cmp             x9, #16
+    bgt             .vl_gt_16_blockcopy_ps_16_16
+    lsl             x1, x1, #1              // destination stride: elements -> bytes
+.rept 8
+    ld1             {v4.16b}, [x2], x3
+    ld1             {v5.16b}, [x2], x3
+    uxtl            v0.8h, v4.8b
+    uxtl2           v1.8h, v4.16b
+    uxtl            v2.8h, v5.8b
+    uxtl2           v3.8h, v5.16b
+    st1             {v0.8h-v1.8h}, [x0], x1
+    st1             {v2.8h-v3.8h}, [x0], x1
+.endr
+    ret
+.vl_gt_16_blockcopy_ps_16_16:
+    ptrue           p0.b, vl32              // 32 byte lanes = 16 halfword lanes
+.rept 16
+    ld1b            {z1.h}, p0/z, [x2]      // byte load widened to halfword lanes
+    st1h            {z1.h}, p0, [x0]
+    add             x0, x0, x1, lsl #1
+    add             x2, x2, x3
+.endr
+    ret
+endfunc
+
+function PFX(blockcopy_ps_32x32_sve)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_blockcopy_ps_32_32
​

x265_3.6.tar.gz/source/common/aarch64/blockcopy8.S Added

@@ -0,0 +1,1299 @@
+/*****************************************************************************
+ * Copyright (C) 2021 MulticoreWare, Inc
+ *
+ * Authors: Sebastian Pop <spop@amazon.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "blockcopy8-common.S"
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
+ *
+ * r0   - a
+ * r1   - stridea
+ * r2   - b
+ * r3   - strideb */
+// 4x4 int16 -> pixel copy (x0 = a, x1 = stridea, x2 = b, x3 = strideb).
+// Two rows per repetition: xtn narrows each halfword to a byte and the low
+// 32 bits (4 pixels) of each result are stored.
+// Memory operands and lane indices are written with their [..] brackets,
+// which had been lost.
+function PFX(blockcopy_sp_4x4_neon)
+    lsl             x3, x3, #1              // source stride: elements -> bytes
+.rept 2
+    ld1             {v0.8h}, [x2], x3       // reads 8 halfwords; only 4 are stored
+    ld1             {v1.8h}, [x2], x3
+    xtn             v0.8b, v0.8h
+    xtn             v1.8b, v1.8h
+    st1             {v0.s}[0], [x0], x1
+    st1             {v1.s}[0], [x0], x1
+.endr
+    ret
+endfunc
+
+// 8x8 int16 -> pixel copy (x0 = a, x1 = stridea, x2 = b, x3 = strideb).
+// Two rows per repetition: xtn narrows 8 halfwords to 8 bytes, stored as
+// one 64-bit lane.
+// Memory operands and lane indices are written with their [..] brackets,
+// which had been lost.
+function PFX(blockcopy_sp_8x8_neon)
+    lsl             x3, x3, #1              // source stride: elements -> bytes
+.rept 4
+    ld1             {v0.8h}, [x2], x3
+    ld1             {v1.8h}, [x2], x3
+    xtn             v0.8b, v0.8h
+    xtn             v1.8b, v1.8h
+    st1             {v0.d}[0], [x0], x1
+    st1             {v1.d}[0], [x0], x1
+.endr
+    ret
+endfunc
+
+// 16x16 int16 -> pixel copy (x0 = a, x1 = stridea, x2 = b, x3 = strideb).
+// Two rows per repetition; tbl with the even-byte index vector from
+// xtn_xtn2_table narrows 16 halfwords (two registers) to 16 bytes.
+// Memory operands are written with their [..] brackets, which had been lost.
+function PFX(blockcopy_sp_16x16_neon)
+    lsl             x3, x3, #1              // source stride: elements -> bytes
+    movrel          x11, xtn_xtn2_table
+    ld1             {v31.16b}, [x11]
+.rept 8
+    ld1             {v0.8h-v1.8h}, [x2], x3
+    ld1             {v2.8h-v3.8h}, [x2], x3
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
+    st1             {v0.16b}, [x0], x1
+    st1             {v1.16b}, [x0], x1
+.endr
+    ret
+endfunc
+
+// 32x32 int16 -> pixel copy (x0 = a, x1 = stridea, x2 = b, x3 = strideb).
+// w12 counts 4 outer iterations of 4 repetitions, two rows each (32 rows).
+// tbl with the even-byte index vector from xtn_xtn2_table narrows each pair
+// of halfword registers to one byte register.
+// Memory operands are written with their [..] brackets, which had been lost.
+function PFX(blockcopy_sp_32x32_neon)
+    mov             w12, #4
+    lsl             x3, x3, #1              // source stride: elements -> bytes
+    movrel          x11, xtn_xtn2_table
+    ld1             {v31.16b}, [x11]
+.loop_csp32:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v0.8h-v3.8h}, [x2], x3
+    ld1             {v4.8h-v7.8h}, [x2], x3
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
+    tbl             v2.16b, {v4.16b,v5.16b}, v31.16b
+    tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
+    st1             {v0.16b-v1.16b}, [x0], x1
+    st1             {v2.16b-v3.16b}, [x0], x1
+.endr
+    cbnz            w12, .loop_csp32
+    ret
+endfunc
+
+// 64x64 int16 -> pixel copy (x0 = a, x1 = stridea, x2 = b, x3 = strideb).
+// Each repetition handles one row: two 64-byte loads (the first advances x2
+// by 64, so x3 is pre-reduced by 64 to finish the row step), four tbl
+// narrowings and one 64-byte store. w12 counts 16 iterations of 4 rows.
+// Memory operands are written with their [..] brackets, which had been lost.
+function PFX(blockcopy_sp_64x64_neon)
+    mov             w12, #16
+    lsl             x3, x3, #1              // source stride: elements -> bytes
+    sub             x3, x3, #64             // first load already advances 64 bytes
+    movrel          x11, xtn_xtn2_table
+    ld1             {v31.16b}, [x11]
+.loop_csp64:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v0.8h-v3.8h}, [x2], #64
+    ld1             {v4.8h-v7.8h}, [x2], x3
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
+    tbl             v2.16b, {v4.16b,v5.16b}, v31.16b
+    tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
+    st1             {v0.16b-v3.16b}, [x0], x1
+.endr
+    cbnz            w12, .loop_csp64
+    ret
+endfunc
+
+// void blockcopy_ps(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
+// 4x4 pixel -> int16 copy (x0 = a, x1 = stridea, x2 = b, x3 = strideb).
+// Two rows per repetition: uxtl zero-extends the bytes to halfwords and the
+// low 4 halfwords of each row are stored.
+// Memory operands are written with their [..] brackets, which had been lost.
+function PFX(blockcopy_ps_4x4_neon)
+    lsl             x1, x1, #1              // destination stride: elements -> bytes
+.rept 2
+    ld1             {v0.8b}, [x2], x3       // reads 8 pixels; only 4 are stored
+    ld1             {v1.8b}, [x2], x3
+    uxtl            v0.8h, v0.8b
+    uxtl            v1.8h, v1.8b
+    st1             {v0.4h}, [x0], x1
+    st1             {v1.4h}, [x0], x1
+.endr
+    ret
+endfunc
+
+// 8x8 pixel -> int16 copy (x0 = a, x1 = stridea, x2 = b, x3 = strideb).
+// Two rows per repetition: uxtl zero-extends 8 bytes to 8 halfwords.
+// Memory operands are written with their [..] brackets, which had been lost.
+function PFX(blockcopy_ps_8x8_neon)
+    lsl             x1, x1, #1              // destination stride: elements -> bytes
+.rept 4
+    ld1             {v0.8b}, [x2], x3
+    ld1             {v1.8b}, [x2], x3
+    uxtl            v0.8h, v0.8b
+    uxtl            v1.8h, v1.8b
+    st1             {v0.8h}, [x0], x1
+    st1             {v1.8h}, [x0], x1
+.endr
+    ret
+endfunc
+
+// 16x16 pixel -> int16 copy (x0 = a, x1 = stridea, x2 = b, x3 = strideb).
+// Two rows per repetition: uxtl/uxtl2 zero-extend the low and high 8 bytes
+// of each row into two halfword registers.
+// Memory operands are written with their [..] brackets, which had been lost.
+function PFX(blockcopy_ps_16x16_neon)
+    lsl             x1, x1, #1              // destination stride: elements -> bytes
+.rept 8
+    ld1             {v4.16b}, [x2], x3
+    ld1             {v5.16b}, [x2], x3
+    uxtl            v0.8h, v4.8b
+    uxtl2           v1.8h, v4.16b
+    uxtl            v2.8h, v5.8b
+    uxtl2           v3.8h, v5.16b
+    st1             {v0.8h-v1.8h}, [x0], x1
+    st1             {v2.8h-v3.8h}, [x0], x1
+.endr
+    ret
+endfunc
+
+// 32x32 pixel -> int16 copy (x0 = a, x1 = stridea, x2 = b, x3 = strideb).
+// w12 counts 4 outer iterations of 4 repetitions, two rows each (32 rows).
+// uxtl/uxtl2 zero-extend each 16-byte register into two halfword registers.
+// Memory operands are written with their [..] brackets, which had been lost.
+function PFX(blockcopy_ps_32x32_neon)
+    lsl             x1, x1, #1              // destination stride: elements -> bytes
+    mov             w12, #4
+.loop_cps32:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v16.16b-v17.16b}, [x2], x3
+    ld1             {v18.16b-v19.16b}, [x2], x3
+    uxtl            v0.8h, v16.8b
+    uxtl2           v1.8h, v16.16b
+    uxtl            v2.8h, v17.8b
+    uxtl2           v3.8h, v17.16b
+    uxtl            v4.8h, v18.8b
+    uxtl2           v5.8h, v18.16b
+    uxtl            v6.8h, v19.8b
+    uxtl2           v7.8h, v19.16b
+    st1             {v0.8h-v3.8h}, [x0], x1
+    st1             {v4.8h-v7.8h}, [x0], x1
+.endr
+    cbnz            w12, .loop_cps32
+    ret
+endfunc
+
+function PFX(blockcopy_ps_64x64_neon)
+    lsl             x1, x1, #1
+    sub             x1, x1, #64
+    mov             w12, #16
+.loop_cps64:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v16.16b-v19.16b}, x2, x3
+    uxtl            v0.8h, v16.8b

 
@@ -0,0 +1,1299 @@
+/*****************************************************************************
+ * Copyright (C) 2021 MulticoreWare, Inc
+ *
+ * Authors: Sebastian Pop <spop@amazon.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "blockcopy8-common.S"
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
+ *
+ * x0   - a
+ * x1   - stridea
+ * x2   - b
+ * x3   - strideb */
+// 4x4 copy that narrows 16-bit values to 8-bit pixels (low byte kept, no saturation).
+// x0 = dst (bytes), x1 = dst stride in bytes, x2 = src (int16_t*), x3 = src stride in elements.
+function PFX(blockcopy_sp_4x4_neon)
+    lsl             x3, x3, #1              // src stride: elements -> bytes
+.rept 2                                     // two rows per repetition, 4 rows total
+    ld1             {v0.8h}, [x2], x3       // loads 8 halfwords per row; only 4 are stored below
+    ld1             {v1.8h}, [x2], x3
+    xtn             v0.8b, v0.8h            // keep the low byte of each halfword
+    xtn             v1.8b, v1.8h
+    st1             {v0.s}[0], [x0], x1     // store 4 bytes
+    st1             {v1.s}[0], [x0], x1
+.endr
+    ret
+endfunc
+
+// 8x8 copy that narrows 16-bit values to 8-bit pixels (low byte kept, no saturation).
+// x0 = dst (bytes), x1 = dst stride in bytes, x2 = src (int16_t*), x3 = src stride in elements.
+function PFX(blockcopy_sp_8x8_neon)
+    lsl             x3, x3, #1              // src stride: elements -> bytes
+.rept 4                                     // two rows per repetition, 8 rows total
+    ld1             {v0.8h}, [x2], x3
+    ld1             {v1.8h}, [x2], x3
+    xtn             v0.8b, v0.8h            // keep the low byte of each halfword
+    xtn             v1.8b, v1.8h
+    st1             {v0.d}[0], [x0], x1     // store 8 bytes
+    st1             {v1.d}[0], [x0], x1
+.endr
+    ret
+endfunc
+
+// 16x16 copy from 16-bit values to 8-bit pixels.
+// x0 = dst (bytes), x1 = dst stride in bytes, x2 = src (int16_t*), x3 = src stride in elements.
+// The byte selection comes from xtn_xtn2_table, defined outside this file; it
+// presumably picks the low byte of every halfword (confirm against the table).
+function PFX(blockcopy_sp_16x16_neon)
+    lsl             x3, x3, #1              // src stride: elements -> bytes
+    movrel          x11, xtn_xtn2_table
+    ld1             {v31.16b}, [x11]        // 16 byte indices for tbl
+.rept 8                                     // two rows per repetition, 16 rows total
+    ld1             {v0.8h-v1.8h}, [x2], x3
+    ld1             {v2.8h-v3.8h}, [x2], x3
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b    // 32 source bytes -> 16 output bytes
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
+    st1             {v0.16b}, [x0], x1
+    st1             {v1.16b}, [x0], x1
+.endr
+    ret
+endfunc
+
+// 32x32 copy from 16-bit values to 8-bit pixels.
+// x0 = dst (bytes), x1 = dst stride in bytes, x2 = src (int16_t*), x3 = src stride in elements.
+// The byte selection comes from xtn_xtn2_table, defined outside this file; it
+// presumably picks the low byte of every halfword (confirm against the table).
+function PFX(blockcopy_sp_32x32_neon)
+    mov             w12, #4                 // 4 iterations x 4 repetitions x 2 rows = 32 rows
+    lsl             x3, x3, #1              // src stride: elements -> bytes
+    movrel          x11, xtn_xtn2_table
+    ld1             {v31.16b}, [x11]        // 16 byte indices for tbl
+.loop_csp32:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v0.8h-v3.8h}, [x2], x3     // one row: 32 halfwords
+    ld1             {v4.8h-v7.8h}, [x2], x3     // next row
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
+    tbl             v2.16b, {v4.16b,v5.16b}, v31.16b
+    tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
+    st1             {v0.16b-v1.16b}, [x0], x1
+    st1             {v2.16b-v3.16b}, [x0], x1
+.endr
+    cbnz            w12, .loop_csp32
+    ret
+endfunc
+
+// 64x64 copy from 16-bit values to 8-bit pixels.
+// x0 = dst (bytes), x1 = dst stride in bytes, x2 = src (int16_t*), x3 = src stride in elements.
+// The byte selection comes from xtn_xtn2_table, defined outside this file; it
+// presumably picks the low byte of every halfword (confirm against the table).
+function PFX(blockcopy_sp_64x64_neon)
+    mov             w12, #16                // 16 iterations x 4 rows = 64 rows
+    lsl             x3, x3, #1              // src stride: elements -> bytes
+    sub             x3, x3, #64             // first load of each row already advances src by 64 bytes
+    movrel          x11, xtn_xtn2_table
+    ld1             {v31.16b}, [x11]        // 16 byte indices for tbl
+.loop_csp64:
+    sub             w12, w12, #1
+.rept 4                                     // one row per repetition
+    ld1             {v0.8h-v3.8h}, [x2], #64    // first 32 halfwords of the row
+    ld1             {v4.8h-v7.8h}, [x2], x3     // last 32 halfwords, then step to the next row
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
+    tbl             v2.16b, {v4.16b,v5.16b}, v31.16b
+    tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
+    st1             {v0.16b-v3.16b}, [x0], x1
+.endr
+    cbnz            w12, .loop_csp64
+    ret
+endfunc
+
+// void blockcopy_ps(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
+// 4x4 copy that widens 8-bit pixels to 16-bit.
+// x0 = dst (int16_t*), x1 = dst stride in elements, x2 = src (bytes), x3 = src stride in bytes.
+function PFX(blockcopy_ps_4x4_neon)
+    lsl             x1, x1, #1              // dst stride: elements -> bytes
+.rept 2                                     // two rows per repetition, 4 rows total
+    ld1             {v0.8b}, [x2], x3       // loads 8 bytes per row; only 4 are stored below
+    ld1             {v1.8b}, [x2], x3
+    uxtl            v0.8h, v0.8b            // zero-extend u8 -> u16
+    uxtl            v1.8h, v1.8b
+    st1             {v0.4h}, [x0], x1       // store the low 4 halfwords
+    st1             {v1.4h}, [x0], x1
+.endr
+    ret
+endfunc
+
+// 8x8 copy that widens 8-bit pixels to 16-bit.
+// x0 = dst (int16_t*), x1 = dst stride in elements, x2 = src (bytes), x3 = src stride in bytes.
+function PFX(blockcopy_ps_8x8_neon)
+    lsl             x1, x1, #1              // dst stride: elements -> bytes
+.rept 4                                     // two rows per repetition, 8 rows total
+    ld1             {v0.8b}, [x2], x3
+    ld1             {v1.8b}, [x2], x3
+    uxtl            v0.8h, v0.8b            // zero-extend u8 -> u16
+    uxtl            v1.8h, v1.8b
+    st1             {v0.8h}, [x0], x1
+    st1             {v1.8h}, [x0], x1
+.endr
+    ret
+endfunc
+
+// 16x16 copy that widens 8-bit pixels to 16-bit.
+// x0 = dst (int16_t*), x1 = dst stride in elements, x2 = src (bytes), x3 = src stride in bytes.
+function PFX(blockcopy_ps_16x16_neon)
+    lsl             x1, x1, #1              // dst stride: elements -> bytes
+.rept 8                                     // two rows per repetition, 16 rows total
+    ld1             {v4.16b}, [x2], x3
+    ld1             {v5.16b}, [x2], x3
+    uxtl            v0.8h, v4.8b            // low 8 bytes  -> 8 halfwords
+    uxtl2           v1.8h, v4.16b           // high 8 bytes -> 8 halfwords
+    uxtl            v2.8h, v5.8b
+    uxtl2           v3.8h, v5.16b
+    st1             {v0.8h-v1.8h}, [x0], x1
+    st1             {v2.8h-v3.8h}, [x0], x1
+.endr
+    ret
+endfunc
+
+// 32x32 copy that widens 8-bit pixels to 16-bit.
+// x0 = dst (int16_t*), x1 = dst stride in elements, x2 = src (bytes), x3 = src stride in bytes.
+function PFX(blockcopy_ps_32x32_neon)
+    lsl             x1, x1, #1              // dst stride: elements -> bytes
+    mov             w12, #4                 // 4 iterations x 4 repetitions x 2 rows = 32 rows
+.loop_cps32:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v16.16b-v17.16b}, [x2], x3     // one 32-byte row
+    ld1             {v18.16b-v19.16b}, [x2], x3     // next row
+    uxtl            v0.8h, v16.8b
+    uxtl2           v1.8h, v16.16b
+    uxtl            v2.8h, v17.8b
+    uxtl2           v3.8h, v17.16b
+    uxtl            v4.8h, v18.8b
+    uxtl2           v5.8h, v18.16b
+    uxtl            v6.8h, v19.8b
+    uxtl2           v7.8h, v19.16b
+    st1             {v0.8h-v3.8h}, [x0], x1
+    st1             {v4.8h-v7.8h}, [x0], x1
+.endr
+    cbnz            w12, .loop_cps32
+    ret
+endfunc
+
+function PFX(blockcopy_ps_64x64_neon)
+    lsl             x1, x1, #1
+    sub             x1, x1, #64
+    mov             w12, #16
+.loop_cps64:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v16.16b-v19.16b}, x2, x3
+    uxtl            v0.8h, v16.8b
​

x265_3.6.tar.gz/source/common/aarch64/dct-prim.cpp Added

@@ -0,0 +1,948 @@
+#include "dct-prim.h"
+
+
+#if HAVE_NEON
+
+#include <arm_neon.h>
+
+
+namespace
+{
+using namespace X265_NS;
+
+
+// Returns `a` with its eight 16-bit lanes in reverse order (lane 0 <-> lane 7, ...).
+static int16x8_t rev16(const int16x8_t a)
+{
+    // Byte indices that reverse the lane order while keeping the two bytes of
+    // each lane in their original order.
+    static const uint8x16_t tbl = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
+    // Every index is below 16, so no fallback value is needed (TBL, not TBX).
+    // The explicit reinterprets keep this valid without lax vector conversions.
+    return vreinterpretq_s16_u8(vqtbl1q_u8(vreinterpretq_u8_s16(a), tbl));
+}
+
+// Returns `a` with its four 32-bit lanes in reverse order (lane 0 <-> lane 3, ...).
+static int32x4_t rev32(const int32x4_t a)
+{
+    // Byte indices that reverse the lane order while keeping the four bytes of
+    // each lane in their original order.
+    static const uint8x16_t tbl = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
+    // Every index is below 16, so no fallback value is needed (TBL, not TBX).
+    // The explicit reinterprets keep this valid without lax vector conversions.
+    return vreinterpretq_s32_u8(vqtbl1q_u8(vreinterpretq_u8_s32(a), tbl));
+}
+
+// Transposes, in place, the 4x4 matrix of 16-bit values whose rows are x0..x3.
+static void transpose_4x4x16(int16x4_t &x0, int16x4_t &x1, int16x4_t &x2, int16x4_t &x3)
+{
+    // View each row as two 32-bit pairs so the first step can move pairs.
+    const int32x2_t w0 = vreinterpret_s32_s16(x0);
+    const int32x2_t w1 = vreinterpret_s32_s16(x1);
+    const int32x2_t w2 = vreinterpret_s32_s16(x2);
+    const int32x2_t w3 = vreinterpret_s32_s16(x3);
+
+    // Step 1: gather matching 32-bit pairs of rows (0,2) and (1,3).
+    const int16x4_t s0 = vreinterpret_s16_s32(vtrn1_s32(w0, w2));
+    const int16x4_t s1 = vreinterpret_s16_s32(vtrn1_s32(w1, w3));
+    const int16x4_t s2 = vreinterpret_s16_s32(vtrn2_s32(w0, w2));
+    const int16x4_t s3 = vreinterpret_s16_s32(vtrn2_s32(w1, w3));
+
+    // Step 2: interleave at 16-bit granularity to finish the transpose.
+    x0 = vtrn1_s16(s0, s1);
+    x1 = vtrn2_s16(s0, s1);
+    x2 = vtrn1_s16(s2, s3);
+    x3 = vtrn2_s16(s2, s3);
+}
+
+
+
+// Walks the coefficients in `scan` order until `numSig` nonzero ones have been
+// visited, filling per-coefficient-group summaries as it goes. A group covers
+// (1 << MLS_CG_SIZE) consecutive scan positions.
+//   coeffSign[cg] - bit k is set when the k-th nonzero coefficient of the group is negative
+//   coeffFlag[cg] - one significance bit per visited coefficient, newest in bit 0
+//   coeffNum[cg]  - number of nonzero coefficients in the group
+// Returns the scan position of the last coefficient visited.
+// The last two parameters are unused.
+static int scanPosLast_opt(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag,
+                           uint8_t *coeffNum, int numSig, const uint16_t * /*scanCG4x4*/, const int /*trSize*/)
+{
+
+    // This is an optimized version of scanPosLast that removes the read-modify-write
+    // dependency on the output arrays by accumulating in locals. Once integrated into
+    // mainline x265 it should replace the reference implementation.
+    // For clarity, the original reference code is left in comments.
+    int scanPosLast = 0;
+
+    uint16_t cSign = 0;
+    uint16_t cFlag = 0;
+    uint8_t cNum = 0;
+
+    uint32_t prevcgIdx = 0;
+    do
+    {
+        const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
+
+        const uint32_t posLast = scan[scanPosLast];
+
+        const int curCoeff = coeff[posLast];
+        const uint32_t isNZCoeff = (curCoeff != 0);
+        /*
+        NOTE: the new algorithm is complicated, so I keep reference code here
+        uint32_t posy   = posLast >> log2TrSize;
+        uint32_t posx   = posLast - (posy << log2TrSize);
+        uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
+        const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
+        sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
+        */
+
+        // get L1 sig map
+        numSig -= isNZCoeff;
+
+        // On entering a new group, flush the accumulators of the previous one.
+        // (At scanPosLast == 0 this just writes zeros to entry 0.)
+        if (scanPosLast % (1 << MLS_CG_SIZE) == 0)
+        {
+            coeffSign[prevcgIdx] = cSign;
+            coeffFlag[prevcgIdx] = cFlag;
+            coeffNum[prevcgIdx] = cNum;
+            cSign = 0;
+            cFlag = 0;
+            cNum = 0;
+        }
+        // TODO: optimize by instruction BTS
+        cSign += (uint16_t)(((curCoeff < 0) ? 1 : 0) << cNum);
+        cFlag = (cFlag << 1) + (uint16_t)isNZCoeff;
+        cNum += (uint8_t)isNZCoeff;
+        prevcgIdx = cgIdx;
+        scanPosLast++;
+    }
+    while (numSig > 0);
+
+    // Flush the (possibly partial) final group.
+    coeffSign[prevcgIdx] = cSign;
+    coeffFlag[prevcgIdx] = cFlag;
+    coeffNum[prevcgIdx] = cNum;
+    return scanPosLast - 1;
+}
+
+
+#if (MLS_CG_SIZE == 4)
+// For a 4-wide, MLS_CG_SIZE-row group starting at `blkPos` (row pitch
+// 1 << log2TrSize), writes coeff^2 << scaleBits into `costUncoded` at the same
+// positions and adds the group total to both *totalUncodedCost and *totalRdCost.
+template<int log2TrSize>
+static void nonPsyRdoQuant_neon(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost,
+                                int64_t *totalRdCost, uint32_t blkPos)
+{
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH -
+                               log2TrSize; /* Represents scaling through forward transform */
+    const int scaleBits = SCALE_BITS - 2 * transformShift;
+    const uint32_t trSize = 1 << log2TrSize;
+
+    int64x2_t vcost_sum_0 = vdupq_n_s64(0);
+    int64x2_t vcost_sum_1 = vdupq_n_s64(0);
+    for (int y = 0; y < MLS_CG_SIZE; y++)
+    {
+        // Four coefficients of this row, squared into 32 bits.
+        int16x4_t in = *(int16x4_t *)&m_resiDctCoeff[blkPos];
+        int32x4_t mul = vmull_s16(in, in);
+        int64x2_t cost0, cost1;
+        // Widen to 64 bits while applying the scale shift.
+        cost0 = vshll_n_s32(vget_low_s32(mul), scaleBits);
+        cost1 = vshll_high_n_s32(mul, scaleBits);
+        *(int64x2_t *)&costUncoded[blkPos + 0] = cost0;
+        *(int64x2_t *)&costUncoded[blkPos + 2] = cost1;
+        vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
+        vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
+        blkPos += trSize;
+    }
+    int64_t sum = vaddvq_s64(vaddq_s64(vcost_sum_0, vcost_sum_1));
+    *totalUncodedCost += sum;
+    *totalRdCost += sum;
+}
+
+// For a 4-wide, MLS_CG_SIZE-row group starting at `blkPos` (row pitch
+// 1 << log2TrSize), computes per coefficient
+//     (resi^2 << scaleBits) - (((fenc - resi) * psyScale) >> max)
+// stores it into `costUncoded` at the same positions, and adds the group total
+// to both *totalUncodedCost and *totalRdCost.
+// Note: *psyScale is narrowed to 32 bits before the widening multiply.
+template<int log2TrSize>
+static void psyRdoQuant_neon(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded,
+                             int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
+{
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH -
+                               log2TrSize; /* Represents scaling through forward transform */
+    const int scaleBits = SCALE_BITS - 2 * transformShift;
+    const uint32_t trSize = 1 << log2TrSize;
+    //using preprocessor to bypass clang bug
+    const int max = X265_MAX(0, (2 * transformShift + 1));
+
+    int64x2_t vcost_sum_0 = vdupq_n_s64(0);
+    int64x2_t vcost_sum_1 = vdupq_n_s64(0);
+    int32x4_t vpsy = vdupq_n_s32(*psyScale);
+    for (int y = 0; y < MLS_CG_SIZE; y++)
+    {
+        int32x4_t signCoef = vmovl_s16(*(int16x4_t *)&m_resiDctCoeff[blkPos]);
+        int32x4_t predictedCoef = vsubq_s32(vmovl_s16(*(int16x4_t *)&m_fencDctCoeff[blkPos]), signCoef);
+        int64x2_t cost0, cost1;
+        cost0 = vmull_s32(vget_low_s32(signCoef), vget_low_s32(signCoef));
+        cost1 = vmull_high_s32(signCoef, signCoef);
+        cost0 = vshlq_n_s64(cost0, scaleBits);
+        cost1 = vshlq_n_s64(cost1, scaleBits);
+        int64x2_t neg0 = vmull_s32(vget_low_s32(predictedCoef), vget_low_s32(vpsy));
+        int64x2_t neg1 = vmull_high_s32(predictedCoef, vpsy);
+        if (max > 0)
+        {
+            // A negative count makes vshlq_s64 shift right.
+            int64x2_t shift = vdupq_n_s64(-max);
+            neg0 = vshlq_s64(neg0, shift);
+            neg1 = vshlq_s64(neg1, shift);
+        }
+        cost0 = vsubq_s64(cost0, neg0);
+        cost1 = vsubq_s64(cost1, neg1);
+        *(int64x2_t *)&costUncoded[blkPos + 0] = cost0;
+        *(int64x2_t *)&costUncoded[blkPos + 2] = cost1;
+        vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
+        vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
+
+        blkPos += trSize;
+    }
+    int64_t sum = vaddvq_s64(vaddq_s64(vcost_sum_0, vcost_sum_1));
+    *totalUncodedCost += sum;
+    *totalRdCost += sum;
+}
+
+#else
+#error "MLS_CG_SIZE must be 4 for neon version"
+#endif
+
+
+
+template<int trSize>
+int  count_nonzero_neon(const int16_t *quantCoeff)
+{
+    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
+    int count = 0;
+    int16x8_t vcount = vdupq_n_s16(0);
+    const int numCoeff = trSize * trSize;
+    int i = 0;
+    for (; (i + 8) <= numCoeff; i += 8)
+    {
+        int16x8_t in = *(int16x8_t *)&quantCoeffi;
+        vcount = vaddq_s16(vcount, vtstq_s16(in, in));
+    }
+    for (; i < numCoeff; i++)
+    {
+        count += quantCoeffi != 0;
+    }
+
+    return count - vaddvq_s16(vcount);

 
@@ -0,0 +1,948 @@
+#include "dct-prim.h"
+
+
+#if HAVE_NEON
+
+#include <arm_neon.h>
+
+
+namespace
+{
+using namespace X265_NS;
+
+
+// Returns `a` with its eight 16-bit lanes in reverse order (lane 0 <-> lane 7, ...).
+static int16x8_t rev16(const int16x8_t a)
+{
+    // Byte indices that reverse the lane order while keeping the two bytes of
+    // each lane in their original order.
+    static const uint8x16_t tbl = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
+    // Every index is below 16, so no fallback value is needed (TBL, not TBX).
+    // The explicit reinterprets keep this valid without lax vector conversions.
+    return vreinterpretq_s16_u8(vqtbl1q_u8(vreinterpretq_u8_s16(a), tbl));
+}
+
+// Returns `a` with its four 32-bit lanes in reverse order (lane 0 <-> lane 3, ...).
+static int32x4_t rev32(const int32x4_t a)
+{
+    // Byte indices that reverse the lane order while keeping the four bytes of
+    // each lane in their original order.
+    static const uint8x16_t tbl = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
+    // Every index is below 16, so no fallback value is needed (TBL, not TBX).
+    // The explicit reinterprets keep this valid without lax vector conversions.
+    return vreinterpretq_s32_u8(vqtbl1q_u8(vreinterpretq_u8_s32(a), tbl));
+}
+
+// Transposes, in place, the 4x4 matrix of 16-bit values whose rows are x0..x3.
+static void transpose_4x4x16(int16x4_t &x0, int16x4_t &x1, int16x4_t &x2, int16x4_t &x3)
+{
+    // View each row as two 32-bit pairs so the first step can move pairs.
+    const int32x2_t w0 = vreinterpret_s32_s16(x0);
+    const int32x2_t w1 = vreinterpret_s32_s16(x1);
+    const int32x2_t w2 = vreinterpret_s32_s16(x2);
+    const int32x2_t w3 = vreinterpret_s32_s16(x3);
+
+    // Step 1: gather matching 32-bit pairs of rows (0,2) and (1,3).
+    const int16x4_t s0 = vreinterpret_s16_s32(vtrn1_s32(w0, w2));
+    const int16x4_t s1 = vreinterpret_s16_s32(vtrn1_s32(w1, w3));
+    const int16x4_t s2 = vreinterpret_s16_s32(vtrn2_s32(w0, w2));
+    const int16x4_t s3 = vreinterpret_s16_s32(vtrn2_s32(w1, w3));
+
+    // Step 2: interleave at 16-bit granularity to finish the transpose.
+    x0 = vtrn1_s16(s0, s1);
+    x1 = vtrn2_s16(s0, s1);
+    x2 = vtrn1_s16(s2, s3);
+    x3 = vtrn2_s16(s2, s3);
+}
+
+
+
+// Walks the coefficients in `scan` order until `numSig` nonzero ones have been
+// visited, filling per-coefficient-group summaries as it goes. A group covers
+// (1 << MLS_CG_SIZE) consecutive scan positions.
+//   coeffSign[cg] - bit k is set when the k-th nonzero coefficient of the group is negative
+//   coeffFlag[cg] - one significance bit per visited coefficient, newest in bit 0
+//   coeffNum[cg]  - number of nonzero coefficients in the group
+// Returns the scan position of the last coefficient visited.
+// The last two parameters are unused.
+static int scanPosLast_opt(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag,
+                           uint8_t *coeffNum, int numSig, const uint16_t * /*scanCG4x4*/, const int /*trSize*/)
+{
+
+    // This is an optimized version of scanPosLast that removes the read-modify-write
+    // dependency on the output arrays by accumulating in locals. Once integrated into
+    // mainline x265 it should replace the reference implementation.
+    // For clarity, the original reference code is left in comments.
+    int scanPosLast = 0;
+
+    uint16_t cSign = 0;
+    uint16_t cFlag = 0;
+    uint8_t cNum = 0;
+
+    uint32_t prevcgIdx = 0;
+    do
+    {
+        const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
+
+        const uint32_t posLast = scan[scanPosLast];
+
+        const int curCoeff = coeff[posLast];
+        const uint32_t isNZCoeff = (curCoeff != 0);
+        /*
+        NOTE: the new algorithm is complicated, so I keep reference code here
+        uint32_t posy   = posLast >> log2TrSize;
+        uint32_t posx   = posLast - (posy << log2TrSize);
+        uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
+        const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
+        sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
+        */
+
+        // get L1 sig map
+        numSig -= isNZCoeff;
+
+        // On entering a new group, flush the accumulators of the previous one.
+        // (At scanPosLast == 0 this just writes zeros to entry 0.)
+        if (scanPosLast % (1 << MLS_CG_SIZE) == 0)
+        {
+            coeffSign[prevcgIdx] = cSign;
+            coeffFlag[prevcgIdx] = cFlag;
+            coeffNum[prevcgIdx] = cNum;
+            cSign = 0;
+            cFlag = 0;
+            cNum = 0;
+        }
+        // TODO: optimize by instruction BTS
+        cSign += (uint16_t)(((curCoeff < 0) ? 1 : 0) << cNum);
+        cFlag = (cFlag << 1) + (uint16_t)isNZCoeff;
+        cNum += (uint8_t)isNZCoeff;
+        prevcgIdx = cgIdx;
+        scanPosLast++;
+    }
+    while (numSig > 0);
+
+    // Flush the (possibly partial) final group.
+    coeffSign[prevcgIdx] = cSign;
+    coeffFlag[prevcgIdx] = cFlag;
+    coeffNum[prevcgIdx] = cNum;
+    return scanPosLast - 1;
+}
+
+
+#if (MLS_CG_SIZE == 4)
+// For a 4-wide, MLS_CG_SIZE-row group starting at `blkPos` (row pitch
+// 1 << log2TrSize), writes coeff^2 << scaleBits into `costUncoded` at the same
+// positions and adds the group total to both *totalUncodedCost and *totalRdCost.
+template<int log2TrSize>
+static void nonPsyRdoQuant_neon(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost,
+                                int64_t *totalRdCost, uint32_t blkPos)
+{
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH -
+                               log2TrSize; /* Represents scaling through forward transform */
+    const int scaleBits = SCALE_BITS - 2 * transformShift;
+    const uint32_t trSize = 1 << log2TrSize;
+
+    int64x2_t vcost_sum_0 = vdupq_n_s64(0);
+    int64x2_t vcost_sum_1 = vdupq_n_s64(0);
+    for (int y = 0; y < MLS_CG_SIZE; y++)
+    {
+        // Four coefficients of this row, squared into 32 bits.
+        int16x4_t in = *(int16x4_t *)&m_resiDctCoeff[blkPos];
+        int32x4_t mul = vmull_s16(in, in);
+        int64x2_t cost0, cost1;
+        // Widen to 64 bits while applying the scale shift.
+        cost0 = vshll_n_s32(vget_low_s32(mul), scaleBits);
+        cost1 = vshll_high_n_s32(mul, scaleBits);
+        *(int64x2_t *)&costUncoded[blkPos + 0] = cost0;
+        *(int64x2_t *)&costUncoded[blkPos + 2] = cost1;
+        vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
+        vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
+        blkPos += trSize;
+    }
+    int64_t sum = vaddvq_s64(vaddq_s64(vcost_sum_0, vcost_sum_1));
+    *totalUncodedCost += sum;
+    *totalRdCost += sum;
+}
+
+// For a 4-wide, MLS_CG_SIZE-row group starting at `blkPos` (row pitch
+// 1 << log2TrSize), computes per coefficient
+//     (resi^2 << scaleBits) - (((fenc - resi) * psyScale) >> max)
+// stores it into `costUncoded` at the same positions, and adds the group total
+// to both *totalUncodedCost and *totalRdCost.
+// Note: *psyScale is narrowed to 32 bits before the widening multiply.
+template<int log2TrSize>
+static void psyRdoQuant_neon(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded,
+                             int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
+{
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH -
+                               log2TrSize; /* Represents scaling through forward transform */
+    const int scaleBits = SCALE_BITS - 2 * transformShift;
+    const uint32_t trSize = 1 << log2TrSize;
+    //using preprocessor to bypass clang bug
+    const int max = X265_MAX(0, (2 * transformShift + 1));
+
+    int64x2_t vcost_sum_0 = vdupq_n_s64(0);
+    int64x2_t vcost_sum_1 = vdupq_n_s64(0);
+    int32x4_t vpsy = vdupq_n_s32(*psyScale);
+    for (int y = 0; y < MLS_CG_SIZE; y++)
+    {
+        int32x4_t signCoef = vmovl_s16(*(int16x4_t *)&m_resiDctCoeff[blkPos]);
+        int32x4_t predictedCoef = vsubq_s32(vmovl_s16(*(int16x4_t *)&m_fencDctCoeff[blkPos]), signCoef);
+        int64x2_t cost0, cost1;
+        cost0 = vmull_s32(vget_low_s32(signCoef), vget_low_s32(signCoef));
+        cost1 = vmull_high_s32(signCoef, signCoef);
+        cost0 = vshlq_n_s64(cost0, scaleBits);
+        cost1 = vshlq_n_s64(cost1, scaleBits);
+        int64x2_t neg0 = vmull_s32(vget_low_s32(predictedCoef), vget_low_s32(vpsy));
+        int64x2_t neg1 = vmull_high_s32(predictedCoef, vpsy);
+        if (max > 0)
+        {
+            // A negative count makes vshlq_s64 shift right.
+            int64x2_t shift = vdupq_n_s64(-max);
+            neg0 = vshlq_s64(neg0, shift);
+            neg1 = vshlq_s64(neg1, shift);
+        }
+        cost0 = vsubq_s64(cost0, neg0);
+        cost1 = vsubq_s64(cost1, neg1);
+        *(int64x2_t *)&costUncoded[blkPos + 0] = cost0;
+        *(int64x2_t *)&costUncoded[blkPos + 2] = cost1;
+        vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
+        vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
+
+        blkPos += trSize;
+    }
+    int64_t sum = vaddvq_s64(vaddq_s64(vcost_sum_0, vcost_sum_1));
+    *totalUncodedCost += sum;
+    *totalRdCost += sum;
+}
+
+#else
+#error "MLS_CG_SIZE must be 4 for neon version"
+#endif
+
+
+
+template<int trSize>
+int  count_nonzero_neon(const int16_t *quantCoeff)
+{
+    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
+    int count = 0;
+    int16x8_t vcount = vdupq_n_s16(0);
+    const int numCoeff = trSize * trSize;
+    int i = 0;
+    for (; (i + 8) <= numCoeff; i += 8)
+    {
+        int16x8_t in = *(int16x8_t *)&quantCoeffi;
+        vcount = vaddq_s16(vcount, vtstq_s16(in, in));
+    }
+    for (; i < numCoeff; i++)
+    {
+        count += quantCoeffi != 0;
+    }
+
+    return count - vaddvq_s16(vcount);
​

x265_3.6.tar.gz/source/common/aarch64/dct-prim.h Added

 
@@ -0,0 +1,19 @@
+#ifndef __DCT_PRIM_NEON_H__
+#define __DCT_PRIM_NEON_H__
+
+
+#include "common.h"
+#include "primitives.h"
+#include "contexts.h"   // costCoeffNxN_c
+#include "threading.h"  // CLZ
+
+namespace X265_NS
+{
+// x265 private namespace
+
+// Takes the encoder primitive table by reference; defined outside this header.
+// Presumably installs the NEON DCT primitives into `p` (inferred from the
+// name - confirm against the definition).
+void setupDCTPrimitives_neon(EncoderPrimitives &p);
+}
+
+
+
+#endif
+
​

x265_3.6.tar.gz/source/common/aarch64/filter-prim.cpp Added

@@ -0,0 +1,995 @@
+#if HAVE_NEON
+
+#include "filter-prim.h"
+#include <arm_neon.h>
+
+namespace
+{
+
+using namespace X265_NS;
+
+
+// Converts a width x height block of pixels to 16-bit intermediates:
+//     dst = (src << (IF_INTERNAL_PREC - X265_DEPTH)) - IF_INTERNAL_OFFS
+// Strides are in elements of the respective buffer.
+// NOTE(review): each step loads and stores 8 full lanes, so a width that is
+// not a multiple of 8 touches columns beyond `width` - confirm callers only
+// instantiate suitable widths.
+template<int width, int height>
+void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
+{
+    const int shift = IF_INTERNAL_PREC - X265_DEPTH;
+    int row, col;
+    const int16x8_t off = vdupq_n_s16(IF_INTERNAL_OFFS);
+    for (row = 0; row < height; row++)
+    {
+
+        for (col = 0; col < width; col += 8)
+        {
+            int16x8_t in;
+
+#if HIGH_BIT_DEPTH
+            in = *(int16x8_t *)&src[col];
+#else
+            // 8-bit pixels: widen to 16 bits first.
+            in = vmovl_u8(*(uint8x8_t *)&src[col]);
+#endif
+
+            int16x8_t tmp = vshlq_n_s16(in, shift);
+            tmp = vsubq_s16(tmp, off);
+            *(int16x8_t *)&dst[col] = tmp;
+
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
+
+// Horizontal N-tap interpolation, pixel in -> pixel out.
+// N == 4 selects g_chromaFilter[coeffIdx], otherwise g_lumaFilter[coeffIdx].
+// Each output is (sum(tap[i] * src[x + i]) + rounding) >> IF_FILTER_PREC,
+// clamped to [0, (1 << X265_DEPTH) - 1]. Eight output columns are produced
+// per inner step.
+template<int N, int width, int height>
+void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+{
+    const int16_t *coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+    int headRoom = IF_FILTER_PREC;
+    int offset = (1 << (headRoom - 1));
+    uint16_t maxVal = (1 << X265_DEPTH) - 1;
+    int cStride = 1;
+
+    // Start (N / 2 - 1) samples to the left so the filter is centred.
+    src -= (N / 2 - 1) * cStride;
+    // Load only the taps that are used. An unconditional 8-tap load would read
+    // past a 4-tap row (presumably the chroma table rows hold just N entries -
+    // confirm against the table declaration).
+    const int16x4_t low_vc = vld1_s16(coeff);
+    const int16x4_t high_vc = (N == 8) ? vld1_s16(coeff + 4) : vdup_n_s16(0);
+
+    const int32x4_t voffset = vdupq_n_s32(offset);
+    const int32x4_t vhr = vdupq_n_s32(-headRoom);   // negative count => right shift
+
+    int row, col;
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width; col += 8)
+        {
+            int32x4_t vsum1, vsum2;
+
+            int16x8_t input[N];
+
+            // input[i] holds the 8 source samples seen by tap i.
+            for (int i = 0; i < N; i++)
+            {
+#if HIGH_BIT_DEPTH
+                input[i] = *(int16x8_t *)&src[col + i];
+#else
+                input[i] = vmovl_u8(*(uint8x8_t *)&src[col + i]);
+#endif
+            }
+            vsum1 = voffset;
+            vsum2 = voffset;
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[0]), low_vc, 0);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[0], low_vc, 0);
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[1]), low_vc, 1);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[1], low_vc, 1);
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[2]), low_vc, 2);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[2], low_vc, 2);
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[3]), low_vc, 3);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[3], low_vc, 3);
+
+            if (N == 8)
+            {
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[4]), high_vc, 0);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[4], high_vc, 0);
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[5]), high_vc, 1);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[5], high_vc, 1);
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[6]), high_vc, 2);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[6], high_vc, 2);
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[7]), high_vc, 3);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[7], high_vc, 3);
+
+            }
+
+            vsum1 = vshlq_s32(vsum1, vhr);
+            vsum2 = vshlq_s32(vsum2, vhr);
+
+            // Keep the low 16 bits of each 32-bit sum, then clamp to the pixel range.
+            int16x8_t vsum = vuzp1q_s16(vsum1, vsum2);
+            vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
+            vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
+#if HIGH_BIT_DEPTH
+            *(int16x8_t *)&dst[col] = vsum;
+#else
+            // Keep the low byte of each clamped 16-bit value.
+            uint8x16_t usum = vuzp1q_u8(vsum, vsum);
+            *(uint8x8_t *)&dst[col] = vget_low_u8(usum);
+#endif
+
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
+#if HIGH_BIT_DEPTH
+
+template<int N, int width, int height>
+void interp_horiz_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx,
+                          int isRowExt)
+{
+    const int16_t *coeff = (N == 4) ? g_chromaFiltercoeffIdx : g_lumaFiltercoeffIdx;
+    const int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
+    const int shift = IF_FILTER_PREC - headRoom;
+    const int offset = (unsigned) - IF_INTERNAL_OFFS << shift;
+
+    int blkheight = height;
+    src -= N / 2 - 1;
+
+    if (isRowExt)
+    {
+        src -= (N / 2 - 1) * srcStride;
+        blkheight += N - 1;
+    }
+    int16x8_t vc3 = vld1q_s16(coeff);
+    const int32x4_t voffset = vdupq_n_s32(offset);
+    const int32x4_t vhr = vdupq_n_s32(-shift);
+
+    int row, col;
+    for (row = 0; row < blkheight; row++)
+    {
+        for (col = 0; col < width; col += 8)
+        {
+            int32x4_t vsum, vsum2;
+
+            int16x8_t inputN;
+            for (int i = 0; i < N; i++)
+            {
+                inputi = vld1q_s16((int16_t *)&srccol + i);
+            }
+
+            vsum = voffset;
+            vsum2 = voffset;
+
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input0), vget_low_s16(vc3), 0);
+            vsum2 = vmlal_high_lane_s16(vsum2, input0, vget_low_s16(vc3), 0);
+
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input1), vget_low_s16(vc3), 1);
+            vsum2 = vmlal_high_lane_s16(vsum2, input1, vget_low_s16(vc3), 1);
+
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input2), vget_low_s16(vc3), 2);
+            vsum2 = vmlal_high_lane_s16(vsum2, input2, vget_low_s16(vc3), 2);
+
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input3), vget_low_s16(vc3), 3);
+            vsum2 = vmlal_high_lane_s16(vsum2, input3, vget_low_s16(vc3), 3);
+
+            if (N == 8)
+            {
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input4), vget_high_s16(vc3), 0);
+                vsum2 = vmlal_high_lane_s16(vsum2, input4, vget_high_s16(vc3), 0);
+
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input5), vget_high_s16(vc3), 1);
+                vsum2 = vmlal_high_lane_s16(vsum2, input5, vget_high_s16(vc3), 1);
+
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input6), vget_high_s16(vc3), 2);
+                vsum2 = vmlal_high_lane_s16(vsum2, input6, vget_high_s16(vc3), 2);
+
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input7), vget_high_s16(vc3), 3);
+                vsum2 = vmlal_high_lane_s16(vsum2, input7, vget_high_s16(vc3), 3);
+            }
+
+            vsum = vshlq_s32(vsum, vhr);
+            vsum2 = vshlq_s32(vsum2, vhr);
+            *(int16x4_t *)&dstcol = vmovn_u32(vsum);
+            *(int16x4_t *)&dstcol+4 = vmovn_u32(vsum2);
+        }
+
+        src += srcStride;
+        dst += dstStride;

 
@@ -0,0 +1,995 @@
+#if HAVE_NEON
+
+#include "filter-prim.h"
+#include <arm_neon.h>
+
+namespace
+{
+
+using namespace X265_NS;
+
+
+// Converts a width x height block of pixels to 16-bit intermediates:
+//     dst = (src << (IF_INTERNAL_PREC - X265_DEPTH)) - IF_INTERNAL_OFFS
+// Strides are in elements of the respective buffer.
+// NOTE(review): each step loads and stores 8 full lanes, so a width that is
+// not a multiple of 8 touches columns beyond `width` - confirm callers only
+// instantiate suitable widths.
+template<int width, int height>
+void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
+{
+    const int shift = IF_INTERNAL_PREC - X265_DEPTH;
+    int row, col;
+    const int16x8_t off = vdupq_n_s16(IF_INTERNAL_OFFS);
+    for (row = 0; row < height; row++)
+    {
+
+        for (col = 0; col < width; col += 8)
+        {
+            int16x8_t in;
+
+#if HIGH_BIT_DEPTH
+            in = *(int16x8_t *)&src[col];
+#else
+            // 8-bit pixels: widen to 16 bits first.
+            in = vmovl_u8(*(uint8x8_t *)&src[col]);
+#endif
+
+            int16x8_t tmp = vshlq_n_s16(in, shift);
+            tmp = vsubq_s16(tmp, off);
+            *(int16x8_t *)&dst[col] = tmp;
+
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
+
+// Horizontal N-tap interpolation, pixel in -> pixel out.
+// N == 4 selects g_chromaFilter[coeffIdx], otherwise g_lumaFilter[coeffIdx].
+// Each output is (sum(tap[i] * src[x + i]) + rounding) >> IF_FILTER_PREC,
+// clamped to [0, (1 << X265_DEPTH) - 1]. Eight output columns are produced
+// per inner step.
+template<int N, int width, int height>
+void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+{
+    const int16_t *coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+    int headRoom = IF_FILTER_PREC;
+    int offset = (1 << (headRoom - 1));
+    uint16_t maxVal = (1 << X265_DEPTH) - 1;
+    int cStride = 1;
+
+    // Start (N / 2 - 1) samples to the left so the filter is centred.
+    src -= (N / 2 - 1) * cStride;
+    // Load only the taps that are used. An unconditional 8-tap load would read
+    // past a 4-tap row (presumably the chroma table rows hold just N entries -
+    // confirm against the table declaration).
+    const int16x4_t low_vc = vld1_s16(coeff);
+    const int16x4_t high_vc = (N == 8) ? vld1_s16(coeff + 4) : vdup_n_s16(0);
+
+    const int32x4_t voffset = vdupq_n_s32(offset);
+    const int32x4_t vhr = vdupq_n_s32(-headRoom);   // negative count => right shift
+
+    int row, col;
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width; col += 8)
+        {
+            int32x4_t vsum1, vsum2;
+
+            int16x8_t input[N];
+
+            // input[i] holds the 8 source samples seen by tap i.
+            for (int i = 0; i < N; i++)
+            {
+#if HIGH_BIT_DEPTH
+                input[i] = *(int16x8_t *)&src[col + i];
+#else
+                input[i] = vmovl_u8(*(uint8x8_t *)&src[col + i]);
+#endif
+            }
+            vsum1 = voffset;
+            vsum2 = voffset;
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[0]), low_vc, 0);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[0], low_vc, 0);
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[1]), low_vc, 1);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[1], low_vc, 1);
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[2]), low_vc, 2);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[2], low_vc, 2);
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[3]), low_vc, 3);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[3], low_vc, 3);
+
+            if (N == 8)
+            {
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[4]), high_vc, 0);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[4], high_vc, 0);
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[5]), high_vc, 1);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[5], high_vc, 1);
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[6]), high_vc, 2);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[6], high_vc, 2);
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[7]), high_vc, 3);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[7], high_vc, 3);
+
+            }
+
+            vsum1 = vshlq_s32(vsum1, vhr);
+            vsum2 = vshlq_s32(vsum2, vhr);
+
+            // Keep the low 16 bits of each 32-bit sum, then clamp to the pixel range.
+            int16x8_t vsum = vuzp1q_s16(vsum1, vsum2);
+            vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
+            vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
+#if HIGH_BIT_DEPTH
+            *(int16x8_t *)&dst[col] = vsum;
+#else
+            // Keep the low byte of each clamped 16-bit value.
+            uint8x16_t usum = vuzp1q_u8(vsum, vsum);
+            *(uint8x8_t *)&dst[col] = vget_low_u8(usum);
+#endif
+
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
+#if HIGH_BIT_DEPTH
+
+template<int N, int width, int height>
+void interp_horiz_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx,
+                          int isRowExt)
+{
+    const int16_t *coeff = (N == 4) ? g_chromaFiltercoeffIdx : g_lumaFiltercoeffIdx;
+    const int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
+    const int shift = IF_FILTER_PREC - headRoom;
+    const int offset = (unsigned) - IF_INTERNAL_OFFS << shift;
+
+    int blkheight = height;
+    src -= N / 2 - 1;
+
+    if (isRowExt)
+    {
+        src -= (N / 2 - 1) * srcStride;
+        blkheight += N - 1;
+    }
+    int16x8_t vc3 = vld1q_s16(coeff);
+    const int32x4_t voffset = vdupq_n_s32(offset);
+    const int32x4_t vhr = vdupq_n_s32(-shift);
+
+    int row, col;
+    for (row = 0; row < blkheight; row++)
+    {
+        for (col = 0; col < width; col += 8)
+        {
+            int32x4_t vsum, vsum2;
+
+            int16x8_t inputN;
+            for (int i = 0; i < N; i++)
+            {
+                inputi = vld1q_s16((int16_t *)&srccol + i);
+            }
+
+            vsum = voffset;
+            vsum2 = voffset;
+
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input0), vget_low_s16(vc3), 0);
+            vsum2 = vmlal_high_lane_s16(vsum2, input0, vget_low_s16(vc3), 0);
+
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input1), vget_low_s16(vc3), 1);
+            vsum2 = vmlal_high_lane_s16(vsum2, input1, vget_low_s16(vc3), 1);
+
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input2), vget_low_s16(vc3), 2);
+            vsum2 = vmlal_high_lane_s16(vsum2, input2, vget_low_s16(vc3), 2);
+
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input3), vget_low_s16(vc3), 3);
+            vsum2 = vmlal_high_lane_s16(vsum2, input3, vget_low_s16(vc3), 3);
+
+            if (N == 8)
+            {
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input4), vget_high_s16(vc3), 0);
+                vsum2 = vmlal_high_lane_s16(vsum2, input4, vget_high_s16(vc3), 0);
+
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input5), vget_high_s16(vc3), 1);
+                vsum2 = vmlal_high_lane_s16(vsum2, input5, vget_high_s16(vc3), 1);
+
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input6), vget_high_s16(vc3), 2);
+                vsum2 = vmlal_high_lane_s16(vsum2, input6, vget_high_s16(vc3), 2);
+
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input7), vget_high_s16(vc3), 3);
+                vsum2 = vmlal_high_lane_s16(vsum2, input7, vget_high_s16(vc3), 3);
+            }
+
+            vsum = vshlq_s32(vsum, vhr);
+            vsum2 = vshlq_s32(vsum2, vhr);
+            *(int16x4_t *)&dstcol = vmovn_u32(vsum);
+            *(int16x4_t *)&dstcol+4 = vmovn_u32(vsum2);
+        }
+
+        src += srcStride;
+        dst += dstStride;
​

x265_3.6.tar.gz/source/common/aarch64/filter-prim.h Added

 
@@ -0,0 +1,21 @@
+#ifndef X265_FILTER_PRIM_ARM64_H
+#define X265_FILTER_PRIM_ARM64_H
+// Declares the setup entry point for the aarch64 NEON filter primitives.
+
+#include "common.h"
+#include "slicetype.h"      // LOWRES_COST_MASK
+#include "primitives.h"
+#include "x265.h"
+
+
+namespace X265_NS
+{
+
+// Takes the EncoderPrimitives table by reference; presumably fills in its filter entries with NEON kernels (definition not visible here -- confirm).
+void setupFilterPrimitives_neon(EncoderPrimitives &p);
+
+} // namespace X265_NS
+
+
+#endif // X265_FILTER_PRIM_ARM64_H
+
​

x265_3.6.tar.gz/source/common/aarch64/fun-decls.h Added

@@ -0,0 +1,256 @@
+/*****************************************************************************
+ * Copyright (C) 2021 MulticoreWare, Inc
+ *
+ * Authors: Sebastian Pop <spop@amazon.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+/* Declares PFX(<name>_NxN_<cpu>) for N = 4, 8, 16, 32 and 64, each with
+ * return type `ret` and parameter list __VA_ARGS__.  PFX wraps only the
+ * pasted identifier (same form as FUNCDEF_PU), so the macro does not depend
+ * on how PFX treats a trailing parameter list.  The expansion has no
+ * trailing ';' -- the use site supplies it. */
+#define FUNCDEF_TU(ret, name, cpu, ...) \
+    ret PFX(name ## _4x4_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x8_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x64_ ## cpu)(__VA_ARGS__)
+/* Declares PFX(<name>_N_<cpu>) for N = 4, 8, 16, 32 and 64 (single-number
+ * size suffix), each with return type `ret` and parameter list __VA_ARGS__.
+ * PFX wraps only the pasted identifier (same form as FUNCDEF_PU).  The
+ * expansion has no trailing ';' -- the use site supplies it. */
+#define FUNCDEF_TU_S(ret, name, cpu, ...) \
+    ret PFX(name ## _4_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64_ ## cpu)(__VA_ARGS__)
+/* Declares PFX(<name>N_<cpu>) for N = 4, 8, 16, 32 and 64 -- the size is
+ * pasted directly onto `name` with no separating underscore.  Each prototype
+ * has return type `ret` and parameter list __VA_ARGS__.  PFX wraps only the
+ * pasted identifier (same form as FUNCDEF_PU).  The expansion has no
+ * trailing ';' -- the use site supplies it. */
+#define FUNCDEF_TU_S2(ret, name, cpu, ...) \
+    ret PFX(name ## 4_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## 8_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## 16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## 32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## 64_ ## cpu)(__VA_ARGS__)
+/* Declares PFX(<name>_WxH_<cpu>) for the 25 listed WxH sizes (square and
+ * rectangular, from 4x4 up to 64x64), each with return type `ret` and
+ * parameter list __VA_ARGS__.  The last line carries no ';', so the use
+ * site must supply it. */
+#define FUNCDEF_PU(ret, name, cpu, ...) \
+    ret PFX(name ## _4x4_   ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x8_   ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x64_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x4_   ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _4x8_   ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x8_  ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x16_  ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x64_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x12_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _12x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x4_  ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _4x16_  ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x24_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _24x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x8_  ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x32_  ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x48_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _48x64_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__)
+/* Declares everything FUNCDEF_PU declares plus 26 additional WxH sizes
+ * (2-, 6-, 12-, 24- and 48-wide/high variants among them), each as
+ * PFX(<name>_WxH_<cpu>) with return type `ret` and parameter list
+ * __VA_ARGS__.  4x4 is not repeated here because FUNCDEF_PU already emits
+ * it.  Like the other FUNCDEF_* helpers the expansion has no trailing ';',
+ * so `FUNCDEF_CHROMA_PU(...);` does not leave an empty declaration behind. */
+#define FUNCDEF_CHROMA_PU(ret, name, cpu, ...) \
+    FUNCDEF_PU(ret, name, cpu, __VA_ARGS__); \
+    ret PFX(name ## _4x2_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _2x4_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x2_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _2x8_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x6_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _6x8_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x12_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _12x8_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _6x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x6_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _2x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x2_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _4x12_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _12x4_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x12_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _12x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x4_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _4x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x48_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _48x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x24_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _24x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x64_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x8_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x24_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _24x64_ ## cpu)(__VA_ARGS__)
+/* DECLS(cpu): expands to the prototype set for one `cpu` suffix by invoking
+ * the FUNCDEF_* helper macros once per primitive family (copy, blockfill,
+ * interp, addAvg, sad, ssd/sse, satd, ssimDist, normFact, ...).  The final
+ * invocation is not followed by ';', so every DECLS(...) use needs its own
+ * terminating ';'. */
+#define DECLS(cpu) \
+    FUNCDEF_TU(void, cpy2Dto1D_shl, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
+    FUNCDEF_TU(void, cpy2Dto1D_shr, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
+    FUNCDEF_TU(void, cpy1Dto2D_shl, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
+    FUNCDEF_TU(void, cpy1Dto2D_shl_aligned, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
+    FUNCDEF_TU(void, cpy1Dto2D_shr, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
+    FUNCDEF_TU_S(uint32_t, copy_cnt, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride); \
+    FUNCDEF_TU_S(int, count_nonzero, cpu, const int16_t* quantCoeff); \
+    FUNCDEF_TU(void, blockfill_s, cpu, int16_t* dst, intptr_t dstride, int16_t val); \
+    FUNCDEF_TU(void, blockfill_s_aligned, cpu, int16_t* dst, intptr_t dstride, int16_t val); \
+    FUNCDEF_CHROMA_PU(void, blockcopy_ss, cpu, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); \
+    FUNCDEF_CHROMA_PU(void, blockcopy_pp, cpu, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
+    FUNCDEF_PU(void, blockcopy_sp, cpu, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); \
+    FUNCDEF_PU(void, blockcopy_ps, cpu, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
+    FUNCDEF_PU(void, interp_8tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_PU(void, interp_8tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
+    FUNCDEF_PU(void, interp_8tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_PU(void, interp_8tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_PU(void, interp_8tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \
+    FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
+    FUNCDEF_CHROMA_PU(void, filterPixelToShort_aligned, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
+    FUNCDEF_CHROMA_PU(void, interp_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_CHROMA_PU(void, interp_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
+    FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
+    FUNCDEF_CHROMA_PU(void, addAvg_aligned, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
+    FUNCDEF_PU(void, pixel_avg_pp, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
+    FUNCDEF_PU(void, pixel_avg_pp_aligned, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
+    FUNCDEF_PU(void, sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
+    FUNCDEF_PU(void, sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
+    FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
+    FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
+    FUNCDEF_TU_S(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
+    FUNCDEF_PU(sse_t, pixel_sse_pp, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
+    FUNCDEF_CHROMA_PU(sse_t, pixel_sse_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
+    FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
+    FUNCDEF_PU(void, pixel_add_ps, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
+    FUNCDEF_PU(void, pixel_add_ps_aligned, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
+    FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
+    FUNCDEF_TU_S2(void, ssimDist, cpu, const pixel *fenc, uint32_t fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k); \
+    FUNCDEF_TU_S2(void, normFact, cpu, const pixel *src, uint32_t blockSize, int shift, uint64_t *z_k)
+/* Emit the full prototype set three times: for the neon, sve and sve2
+ * function-name suffixes. */
+DECLS(neon);
+DECLS(sve);
+DECLS(sve2);
+
+/* pixel_planecopy_cp prototype: const uint8_t* source and pixel* destination,
+ * each with its own stride, plus width, height and a shift argument. */
+void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
+/* pixel_var prototypes for the 8x8, 16x16, 32x32 and 64x64 sizes; each takes
+ * a const pixel pointer and a stride and returns uint64_t.
+ * NOTE(review): the layout of the 64-bit result is not visible here. */
+uint64_t x265_pixel_var_8x8_neon(const pixel* pix, intptr_t stride);
+uint64_t x265_pixel_var_16x16_neon(const pixel* pix, intptr_t stride);
+uint64_t x265_pixel_var_32x32_neon(const pixel* pix, intptr_t stride);
+uint64_t x265_pixel_var_64x64_neon(const pixel* pix, intptr_t stride);
+/* getResidual prototypes for sizes 4, 8, 16 and 32: two const pixel inputs
+ * (fenc, pred), a non-const int16_t* residual and a single stride argument. */
+void x265_getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+/* scale1D_128to64 and scale2D_64to32 prototypes; only the 2D variant takes a
+ * stride.  NOTE(review): the scaling filter itself is not visible here. */
+void x265_scale1D_128to64_neon(pixel *dst, const pixel *src);
+void x265_scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride);
+/* Explicit pixel_satd prototypes, one per block size; all share the
+ * (pix1, stride_pix1, pix2, stride_pix2) -> int signature.
+ * NOTE(review): DECLS(neon) already declares pixel_satd_WxH through
+ * FUNCDEF_CHROMA_PU; if PFX prepends "x265_", these are redundant
+ * redeclarations -- confirm. */
+int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);

 
@@ -0,0 +1,256 @@
+/*****************************************************************************
+ * Copyright (C) 2021 MulticoreWare, Inc
+ *
+ * Authors: Sebastian Pop <spop@amazon.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+/* Declares PFX(<name>_NxN_<cpu>) for N = 4, 8, 16, 32 and 64, each with
+ * return type `ret` and parameter list __VA_ARGS__.  PFX wraps only the
+ * pasted identifier (same form as FUNCDEF_PU), so the macro does not depend
+ * on how PFX treats a trailing parameter list.  The expansion has no
+ * trailing ';' -- the use site supplies it. */
+#define FUNCDEF_TU(ret, name, cpu, ...) \
+    ret PFX(name ## _4x4_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x8_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x64_ ## cpu)(__VA_ARGS__)
+/* Declares PFX(<name>_N_<cpu>) for N = 4, 8, 16, 32 and 64 (single-number
+ * size suffix), each with return type `ret` and parameter list __VA_ARGS__.
+ * PFX wraps only the pasted identifier (same form as FUNCDEF_PU).  The
+ * expansion has no trailing ';' -- the use site supplies it. */
+#define FUNCDEF_TU_S(ret, name, cpu, ...) \
+    ret PFX(name ## _4_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64_ ## cpu)(__VA_ARGS__)
+/* Declares PFX(<name>N_<cpu>) for N = 4, 8, 16, 32 and 64 -- the size is
+ * pasted directly onto `name` with no separating underscore.  Each prototype
+ * has return type `ret` and parameter list __VA_ARGS__.  PFX wraps only the
+ * pasted identifier (same form as FUNCDEF_PU).  The expansion has no
+ * trailing ';' -- the use site supplies it. */
+#define FUNCDEF_TU_S2(ret, name, cpu, ...) \
+    ret PFX(name ## 4_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## 8_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## 16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## 32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## 64_ ## cpu)(__VA_ARGS__)
+/* Declares PFX(<name>_WxH_<cpu>) for the 25 listed WxH sizes (square and
+ * rectangular, from 4x4 up to 64x64), each with return type `ret` and
+ * parameter list __VA_ARGS__.  The last line carries no ';', so the use
+ * site must supply it. */
+#define FUNCDEF_PU(ret, name, cpu, ...) \
+    ret PFX(name ## _4x4_   ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x8_   ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x64_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x4_   ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _4x8_   ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x8_  ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x16_  ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x64_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x12_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _12x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x4_  ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _4x16_  ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x24_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _24x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x8_  ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x32_  ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x48_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _48x64_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__)
+/* Declares everything FUNCDEF_PU declares plus 26 additional WxH sizes
+ * (2-, 6-, 12-, 24- and 48-wide/high variants among them), each as
+ * PFX(<name>_WxH_<cpu>) with return type `ret` and parameter list
+ * __VA_ARGS__.  4x4 is not repeated here because FUNCDEF_PU already emits
+ * it.  Like the other FUNCDEF_* helpers the expansion has no trailing ';',
+ * so `FUNCDEF_CHROMA_PU(...);` does not leave an empty declaration behind. */
+#define FUNCDEF_CHROMA_PU(ret, name, cpu, ...) \
+    FUNCDEF_PU(ret, name, cpu, __VA_ARGS__); \
+    ret PFX(name ## _4x2_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _2x4_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x2_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _2x8_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x6_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _6x8_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x12_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _12x8_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _6x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x6_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _2x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x2_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _4x12_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _12x4_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x12_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _12x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x4_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _4x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x48_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _48x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x24_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _24x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x64_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x8_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x24_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _24x64_ ## cpu)(__VA_ARGS__)
+/* DECLS(cpu): expands to the prototype set for one `cpu` suffix by invoking
+ * the FUNCDEF_* helper macros once per primitive family (copy, blockfill,
+ * interp, addAvg, sad, ssd/sse, satd, ssimDist, normFact, ...).  The final
+ * invocation is not followed by ';', so every DECLS(...) use needs its own
+ * terminating ';'. */
+#define DECLS(cpu) \
+    FUNCDEF_TU(void, cpy2Dto1D_shl, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
+    FUNCDEF_TU(void, cpy2Dto1D_shr, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
+    FUNCDEF_TU(void, cpy1Dto2D_shl, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
+    FUNCDEF_TU(void, cpy1Dto2D_shl_aligned, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
+    FUNCDEF_TU(void, cpy1Dto2D_shr, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
+    FUNCDEF_TU_S(uint32_t, copy_cnt, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride); \
+    FUNCDEF_TU_S(int, count_nonzero, cpu, const int16_t* quantCoeff); \
+    FUNCDEF_TU(void, blockfill_s, cpu, int16_t* dst, intptr_t dstride, int16_t val); \
+    FUNCDEF_TU(void, blockfill_s_aligned, cpu, int16_t* dst, intptr_t dstride, int16_t val); \
+    FUNCDEF_CHROMA_PU(void, blockcopy_ss, cpu, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); \
+    FUNCDEF_CHROMA_PU(void, blockcopy_pp, cpu, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
+    FUNCDEF_PU(void, blockcopy_sp, cpu, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); \
+    FUNCDEF_PU(void, blockcopy_ps, cpu, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
+    FUNCDEF_PU(void, interp_8tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_PU(void, interp_8tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
+    FUNCDEF_PU(void, interp_8tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_PU(void, interp_8tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_PU(void, interp_8tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \
+    FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
+    FUNCDEF_CHROMA_PU(void, filterPixelToShort_aligned, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
+    FUNCDEF_CHROMA_PU(void, interp_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_CHROMA_PU(void, interp_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
+    FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
+    FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
+    FUNCDEF_CHROMA_PU(void, addAvg_aligned, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
+    FUNCDEF_PU(void, pixel_avg_pp, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
+    FUNCDEF_PU(void, pixel_avg_pp_aligned, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
+    FUNCDEF_PU(void, sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
+    FUNCDEF_PU(void, sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
+    FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
+    FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
+    FUNCDEF_TU_S(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
+    FUNCDEF_PU(sse_t, pixel_sse_pp, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
+    FUNCDEF_CHROMA_PU(sse_t, pixel_sse_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
+    FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
+    FUNCDEF_PU(void, pixel_add_ps, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
+    FUNCDEF_PU(void, pixel_add_ps_aligned, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
+    FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
+    FUNCDEF_TU_S2(void, ssimDist, cpu, const pixel *fenc, uint32_t fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k); \
+    FUNCDEF_TU_S2(void, normFact, cpu, const pixel *src, uint32_t blockSize, int shift, uint64_t *z_k)
+/* Emit the full prototype set three times: for the neon, sve and sve2
+ * function-name suffixes. */
+DECLS(neon);
+DECLS(sve);
+DECLS(sve2);
+
+/* pixel_planecopy_cp prototype: const uint8_t* source and pixel* destination,
+ * each with its own stride, plus width, height and a shift argument. */
+void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
+/* pixel_var prototypes for the 8x8, 16x16, 32x32 and 64x64 sizes; each takes
+ * a const pixel pointer and a stride and returns uint64_t.
+ * NOTE(review): the layout of the 64-bit result is not visible here. */
+uint64_t x265_pixel_var_8x8_neon(const pixel* pix, intptr_t stride);
+uint64_t x265_pixel_var_16x16_neon(const pixel* pix, intptr_t stride);
+uint64_t x265_pixel_var_32x32_neon(const pixel* pix, intptr_t stride);
+uint64_t x265_pixel_var_64x64_neon(const pixel* pix, intptr_t stride);
+/* getResidual prototypes for sizes 4, 8, 16 and 32: two const pixel inputs
+ * (fenc, pred), a non-const int16_t* residual and a single stride argument. */
+void x265_getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+/* scale1D_128to64 and scale2D_64to32 prototypes; only the 2D variant takes a
+ * stride.  NOTE(review): the scaling filter itself is not visible here. */
+void x265_scale1D_128to64_neon(pixel *dst, const pixel *src);
+void x265_scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride);
+/* Explicit pixel_satd prototypes, one per block size; all share the
+ * (pix1, stride_pix1, pix2, stride_pix2) -> int signature.
+ * NOTE(review): DECLS(neon) already declares pixel_satd_WxH through
+ * FUNCDEF_CHROMA_PU; if PFX prepends "x265_", these are redundant
+ * redeclarations -- confirm. */
+int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
​

x265_3.6.tar.gz/source/common/aarch64/intrapred-prim.cpp Added

@@ -0,0 +1,265 @@
+#include "common.h"
+#include "primitives.h"
+
+
+#if 1
+#include "arm64-utils.h"
+#include <arm_neon.h>
+
+using namespace X265_NS;
+
+namespace
+{
+
+
+
+// Angular intra prediction of one width x width block, with NEON interpolation.
+//
+// dst, dstStride : output block and its row stride, in pixels.
+// srcPix0        : neighbour pixels; [0] is the top-left pixel, [1 .. 2*width] the top and
+//                  top-right run, [2*width + 1 .. 4*width] the left run.
+// dirMode        : prediction mode; it indexes angleTable, so it must lie in 2..34.
+//                  Modes below 18 are treated as horizontal: the two neighbour runs are
+//                  swapped, the block is predicted as if vertical and the result is
+//                  transposed in place.
+// bFilter        : when non-zero, the first column of a pure vertical/horizontal
+//                  (angle == 0) prediction is edge-filtered.
+template<int width>
+void intra_pred_ang_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter)
+{
+    const int width2 = width << 1;
+    // Flip the neighbours in the horizontal case.
+    const int horMode = dirMode < 18;
+    // 1 + 4 * width entries are used, so 129 covers every width up to 32.
+    pixel neighbourBuf[129];
+    const pixel *srcPix = srcPix0;
+
+    if (horMode)
+    {
+        // Swap the top and left runs so that the vertical code path below can be reused.
+        neighbourBuf[0] = srcPix[0];
+        memcpy(&neighbourBuf[1], &srcPix[width2 + 1], sizeof(pixel) * width2);
+        memcpy(&neighbourBuf[width2 + 1], &srcPix[1], sizeof(pixel) * width2);
+        srcPix = neighbourBuf;
+    }
+
+    // Intra prediction angle and inverse angle tables.
+    static const int8_t angleTable[17] = { -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+    static const int16_t invAngleTable[8] = { 4096, 1638, 910, 630, 482, 390, 315, 256 };
+
+    // Get the prediction angle.
+    const int angleOffset = horMode ? 10 - dirMode : dirMode - 26;
+    const int angle = angleTable[8 + angleOffset];
+
+    // Vertical Prediction.
+    if (!angle)
+    {
+        // Every row is a copy of the top neighbours.
+        for (int y = 0; y < width; y++)
+        {
+            memcpy(&dst[y * dstStride], srcPix + 1, sizeof(pixel) * width);
+        }
+        if (bFilter)
+        {
+            // Edge filter: blend the first column with the gradient along the left run.
+            int topLeft = srcPix[0], top = srcPix[1];
+            for (int y = 0; y < width; y++)
+            {
+                dst[y * dstStride] = x265_clip((int16_t)(top + ((srcPix[width2 + 1 + y] - topLeft) >> 1)));
+            }
+        }
+    }
+    else // Angular prediction.
+    {
+        // Get the reference pixels. The reference base is the first pixel to the top (neighbourBuf[1]).
+        pixel refBuf[64];
+        const pixel *ref;
+
+        // Use the projected left neighbours and the top neighbours.
+        if (angle < 0)
+        {
+            // Number of neighbours projected.
+            int nbProjected = -((width * angle) >> 5) - 1;
+            pixel *ref_pix = refBuf + nbProjected + 1;
+
+            // Project the neighbours.
+            int invAngle = invAngleTable[-angleOffset - 1];
+            int invAngleSum = 128;
+            for (int i = 0; i < nbProjected; i++)
+            {
+                invAngleSum += invAngle;
+                ref_pix[-2 - i] = srcPix[width2 + (invAngleSum >> 8)];
+            }
+
+            // Copy the top-left and top pixels: ref_pix[-1 .. width - 1] = srcPix[0 .. width].
+            memcpy(&ref_pix[-1], srcPix, (width + 1) * sizeof(pixel));
+            ref = ref_pix;
+        }
+        else // Use the top and top-right neighbours.
+        {
+            ref = srcPix + 1;
+        }
+
+        // Pass every row.
+        int angleSum = 0;
+        for (int y = 0; y < width; y++)
+        {
+            angleSum += angle;
+            int offset = angleSum >> 5;
+            int fraction = angleSum & 31;
+
+            if (fraction) // Interpolate
+            {
+                if (width >= 8 && sizeof(pixel) == 1)
+                {
+                    // Cast to the concrete element type so this branch compiles whatever
+                    // 'pixel' is; it only executes when pixel is one byte wide.
+                    const uint8_t *ref_u8 = (const uint8_t *)ref + offset;
+                    uint8_t *dst_u8 = (uint8_t *)dst + y * dstStride;
+
+                    const uint8x8_t f0 = vdup_n_u8(32 - fraction);
+                    const uint8x8_t f1 = vdup_n_u8(fraction);
+                    for (int x = 0; x < width; x += 8)
+                    {
+                        uint8x8_t in0 = vld1_u8(ref_u8 + x);
+                        uint8x8_t in1 = vld1_u8(ref_u8 + x + 1);
+                        // 16 + in0 * (32 - fraction) + in1 * fraction, at most 8176: fits in 16 bits.
+                        uint16x8_t lo = vmlal_u8(vdupq_n_u16(16), in0, f0);
+                        lo = vmlal_u8(lo, in1, f1);
+                        vst1_u8(dst_u8 + x, vshrn_n_u16(lo, 5));
+                    }
+                }
+                else if (width >= 4 && sizeof(pixel) == 2)
+                {
+                    // Same scheme for two-byte pixels, accumulating in 32 bits.
+                    const uint16_t *ref_u16 = (const uint16_t *)ref + offset;
+                    uint16_t *dst_u16 = (uint16_t *)dst + y * dstStride;
+
+                    const uint16x4_t f0 = vdup_n_u16(32 - fraction);
+                    const uint16x4_t f1 = vdup_n_u16(fraction);
+                    for (int x = 0; x < width; x += 4)
+                    {
+                        uint16x4_t in0 = vld1_u16(ref_u16 + x);
+                        uint16x4_t in1 = vld1_u16(ref_u16 + x + 1);
+                        uint32x4_t lo = vmlal_u16(vdupq_n_u32(16), in0, f0);
+                        lo = vmlal_u16(lo, in1, f1);
+                        vst1_u16(dst_u16 + x, vshrn_n_u32(lo, 5));
+                    }
+                }
+                else
+                {
+                    // Scalar fallback for widths too small for the vector paths.
+                    for (int x = 0; x < width; x++)
+                    {
+                        dst[y * dstStride + x] = (pixel)(((32 - fraction) * ref[offset + x] + fraction * ref[offset + x + 1] + 16) >> 5);
+                    }
+                }
+            }
+            else // Copy.
+            {
+                memcpy(&dst[y * dstStride], &ref[offset], sizeof(pixel) * width);
+            }
+        }
+    }
+
+    // Flip for horizontal.
+    if (horMode)
+    {
+        if (width == 8)
+        {
+            transpose8x8(dst, dst, dstStride, dstStride);
+        }
+        else if (width == 16)
+        {
+            transpose16x16(dst, dst, dstStride, dstStride);
+        }
+        else if (width == 32)
+        {
+            transpose32x32(dst, dst, dstStride, dstStride);
+        }
+        else
+        {
+            // Generic in-place transpose: swap each element above the diagonal with its mirror.
+            for (int y = 0; y < width - 1; y++)
+            {
+                for (int x = y + 1; x < width; x++)
+                {
+                    pixel tmp              = dst[y * dstStride + x];
+                    dst[y * dstStride + x] = dst[x * dstStride + y];
+                    dst[x * dstStride + y] = tmp;
+                }
+            }
+        }
+    }
+}
+
+template<int log2Size>
+void all_angs_pred_neon(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
+{
+    const int size = 1 << log2Size;
+    for (int mode = 2; mode <= 34; mode++)
+    {
+        pixel *srcPix  = (g_intraFilterFlagsmode & size ? filtPix  : refPix);
+        pixel *out = dest + ((mode - 2) << (log2Size * 2));
+
+        intra_pred_ang_neon<size>(out, size, srcPix, mode, bLuma);
+
+        // Optimize code don't flip buffer
+        bool modeHor = (mode < 18);
+
+        // transpose the block if this is a horizontal mode
+        if (modeHor)
+        {
+            if (size == 8)
+            {
+                transpose8x8(out, out, size, size);
+            }

 
@@ -0,0 +1,265 @@
+#include "common.h"
+#include "primitives.h"
+
+
+#if 1
+#include "arm64-utils.h"
+#include <arm_neon.h>
+
+using namespace X265_NS;
+
+namespace
+{
+
+
+
+template<int width>
+void intra_pred_ang_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter)
+{
+    int width2 = width << 1;
+    // Flip the neighbours in the horizontal case.
+    int horMode = dirMode < 18;
+    pixel neighbourBuf129;
+    const pixel *srcPix = srcPix0;
+
+    if (horMode)
+    {
+        neighbourBuf0 = srcPix0;
+        //for (int i = 0; i < width << 1; i++)
+        //{
+        //    neighbourBuf1 + i = srcPixwidth2 + 1 + i;
+        //    neighbourBufwidth2 + 1 + i = srcPix1 + i;
+        //}
+        memcpy(&neighbourBuf1, &srcPixwidth2 + 1, sizeof(pixel) * (width << 1));
+        memcpy(&neighbourBufwidth2 + 1, &srcPix1, sizeof(pixel) * (width << 1));
+        srcPix = neighbourBuf;
+    }
+
+    // Intra prediction angle and inverse angle tables.
+    const int8_t angleTable17 = { -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+    const int16_t invAngleTable8 = { 4096, 1638, 910, 630, 482, 390, 315, 256 };
+
+    // Get the prediction angle.
+    int angleOffset = horMode ? 10 - dirMode : dirMode - 26;
+    int angle = angleTable8 + angleOffset;
+
+    // Vertical Prediction.
+    if (!angle)
+    {
+        for (int y = 0; y < width; y++)
+        {
+            memcpy(&dsty * dstStride, srcPix + 1, sizeof(pixel)*width);
+        }
+        if (bFilter)
+        {
+            int topLeft = srcPix0, top = srcPix1;
+            for (int y = 0; y < width; y++)
+            {
+                dsty * dstStride = x265_clip((int16_t)(top + ((srcPixwidth2 + 1 + y - topLeft) >> 1)));
+            }
+        }
+    }
+    else // Angular prediction.
+    {
+        // Get the reference pixels. The reference base is the first pixel to the top (neighbourBuf1).
+        pixel refBuf64;
+        const pixel *ref;
+
+        // Use the projected left neighbours and the top neighbours.
+        if (angle < 0)
+        {
+            // Number of neighbours projected.
+            int nbProjected = -((width * angle) >> 5) - 1;
+            pixel *ref_pix = refBuf + nbProjected + 1;
+
+            // Project the neighbours.
+            int invAngle = invAngleTable- angleOffset - 1;
+            int invAngleSum = 128;
+            for (int i = 0; i < nbProjected; i++)
+            {
+                invAngleSum += invAngle;
+                ref_pix- 2 - i = srcPixwidth2 + (invAngleSum >> 8);
+            }
+
+            // Copy the top-left and top pixels.
+            //for (int i = 0; i < width + 1; i++)
+            //ref_pix-1 + i = srcPixi;
+
+            memcpy(&ref_pix-1, srcPix, (width + 1)*sizeof(pixel));
+            ref = ref_pix;
+        }
+        else // Use the top and top-right neighbours.
+        {
+            ref = srcPix + 1;
+        }
+
+        // Pass every row.
+        int angleSum = 0;
+        for (int y = 0; y < width; y++)
+        {
+            angleSum += angle;
+            int offset = angleSum >> 5;
+            int fraction = angleSum & 31;
+
+            if (fraction) // Interpolate
+            {
+                if (width >= 8 && sizeof(pixel) == 1)
+                {
+                    const int16x8_t f0 = vdupq_n_s16(32 - fraction);
+                    const int16x8_t f1 = vdupq_n_s16(fraction);
+                    for (int x = 0; x < width; x += 8)
+                    {
+                        uint8x8_t in0 = *(uint8x8_t *)&refoffset + x;
+                        uint8x8_t in1 = *(uint8x8_t *)&refoffset + x + 1;
+                        int16x8_t lo = vmlaq_s16(vdupq_n_s16(16), vmovl_u8(in0), f0);
+                        lo = vmlaq_s16(lo, vmovl_u8(in1), f1);
+                        lo = vshrq_n_s16(lo, 5);
+                        *(uint8x8_t *)&dsty * dstStride + x = vmovn_u16(lo);
+                    }
+                }
+                else if (width >= 4 && sizeof(pixel) == 2)
+                {
+                    const int32x4_t f0 = vdupq_n_s32(32 - fraction);
+                    const int32x4_t f1 = vdupq_n_s32(fraction);
+                    for (int x = 0; x < width; x += 4)
+                    {
+                        uint16x4_t in0 = *(uint16x4_t *)&refoffset + x;
+                        uint16x4_t in1 = *(uint16x4_t *)&refoffset + x + 1;
+                        int32x4_t lo = vmlaq_s32(vdupq_n_s32(16), vmovl_u16(in0), f0);
+                        lo = vmlaq_s32(lo, vmovl_u16(in1), f1);
+                        lo = vshrq_n_s32(lo, 5);
+                        *(uint16x4_t *)&dsty * dstStride + x = vmovn_u32(lo);
+                    }
+                }
+                else
+                {
+                    for (int x = 0; x < width; x++)
+                    {
+                        dsty * dstStride + x = (pixel)(((32 - fraction) * refoffset + x + fraction * refoffset + x + 1 + 16) >> 5);
+                    }
+                }
+            }
+            else // Copy.
+            {
+                memcpy(&dsty * dstStride, &refoffset, sizeof(pixel)*width);
+            }
+        }
+    }
+
+    // Flip for horizontal.
+    if (horMode)
+    {
+        if (width == 8)
+        {
+            transpose8x8(dst, dst, dstStride, dstStride);
+        }
+        else if (width == 16)
+        {
+            transpose16x16(dst, dst, dstStride, dstStride);
+        }
+        else if (width == 32)
+        {
+            transpose32x32(dst, dst, dstStride, dstStride);
+        }
+        else
+        {
+            for (int y = 0; y < width - 1; y++)
+            {
+                for (int x = y + 1; x < width; x++)
+                {
+                    pixel tmp              = dsty * dstStride + x;
+                    dsty * dstStride + x = dstx * dstStride + y;
+                    dstx * dstStride + y = tmp;
+                }
+            }
+        }
+    }
+}
+
+template<int log2Size>
+void all_angs_pred_neon(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
+{
+    const int size = 1 << log2Size;
+    for (int mode = 2; mode <= 34; mode++)
+    {
+        pixel *srcPix  = (g_intraFilterFlagsmode & size ? filtPix  : refPix);
+        pixel *out = dest + ((mode - 2) << (log2Size * 2));
+
+        intra_pred_ang_neon<size>(out, size, srcPix, mode, bLuma);
+
+        // Optimize code don't flip buffer
+        bool modeHor = (mode < 18);
+
+        // transpose the block if this is a horizontal mode
+        if (modeHor)
+        {
+            if (size == 8)
+            {
+                transpose8x8(out, out, size, size);
+            }
​

x265_3.6.tar.gz/source/common/aarch64/intrapred-prim.h Added

 
@@ -0,0 +1,15 @@
+#ifndef INTRAPRED_PRIM_H__
+#define INTRAPRED_PRIM_H__
+
+#if defined(__aarch64__)
+
+namespace X265_NS
+{
+// x265 private namespace
+
+// EncoderPrimitives must already be declared by the includer; this header includes nothing.
+// NOTE(review): presumably registers the NEON intra-prediction primitives in p -- confirm.
+void setupIntraPrimitives_neon(EncoderPrimitives &p);
+}
+#endif // defined(__aarch64__)
+#endif // INTRAPRED_PRIM_H__
​

x265_3.6.tar.gz/source/common/aarch64/ipfilter-common.S Added

@@ -0,0 +1,1436 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// This file contains the macros written using NEON instruction set
+// that are also used by the SVE2 functions
+
+// Macros below follow these conventions:
+// - input data in registers: v0, v1, v2, v3, v4, v5, v6, v7
+// - constants in registers: v24, v25, v26, v27, v31
+// - temporary registers: v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30.
+// - _32b macros output a result in v17.4s
+// - _64b and _32b_1 macros output results in v17.4s, v18.4s
+
+#include "asm.S"
+
+.arch           armv8-a
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+// Load 16 bytes from [x11] into d6/d7 (x11 is post-incremented by 16) and build the
+// byte-shifted 8-byte windows used by luma filter \v: vN = bytes N+1 .. N+8 of d6:d7.
+.macro vextin8 v
+    ldp             d6, d7, [x11], #16
+.if \v == 0
+    // qpel_filter_0 only uses values in v3
+    ext             v3.8b, v6.8b, v7.8b, #4
+.else
+.if \v != 3
+    // qpel_filter_3 does not use values in v0
+    ext             v0.8b, v6.8b, v7.8b, #1
+.endif
+    ext             v1.8b, v6.8b, v7.8b, #2
+    ext             v2.8b, v6.8b, v7.8b, #3
+    ext             v3.8b, v6.8b, v7.8b, #4
+    ext             v4.8b, v6.8b, v7.8b, #5
+    ext             v5.8b, v6.8b, v7.8b, #6
+    ext             v6.8b, v6.8b, v7.8b, #7      // done last: this overwrites the v6 source
+.endif
+.endm
+
+// 16-byte-wide variant of vextin8: load 32 bytes from [x11] into q6/q7 (x11 is
+// post-incremented by 32) and build the windows vN = bytes N+1 .. N+16 of q6:q7.
+.macro vextin8_64 v
+    ldp             q6, q7, [x11], #32
+.if \v == 0
+    // qpel_filter_0 only uses values in v3
+    ext             v3.16b, v6.16b, v7.16b, #4
+.else
+.if \v != 3
+    // qpel_filter_3 does not use values in v0
+    ext             v0.16b, v6.16b, v7.16b, #1
+.endif
+    ext             v1.16b, v6.16b, v7.16b, #2
+    ext             v2.16b, v6.16b, v7.16b, #3
+    ext             v3.16b, v6.16b, v7.16b, #4
+    ext             v4.16b, v6.16b, v7.16b, #5
+    ext             v5.16b, v6.16b, v7.16b, #6
+.if \v == 1
+    ext             v6.16b, v6.16b, v7.16b, #7
+    // qpel_filter_1 does not use v7
+.else
+    // v6 is still needed as a source for the #8 window, so stage the #7 window in v16.
+    ext             v16.16b, v6.16b, v7.16b, #7
+    ext             v7.16b, v6.16b, v7.16b, #8
+    mov             v6.16b, v16.16b
+.endif
+.endif
+.endm
+
+// Chroma variant of vextin8: load 16 bytes from [x11] into d6/d7 (x11 is post-incremented
+// by 16) and build the four windows v0..v3 = bytes 1..8, 2..9, 3..10, 4..11 of d6:d7.
+.macro vextin8_chroma v
+    ldp             d6, d7, [x11], #16
+.if \v == 0
+    // qpel_filter_chroma_0 only uses values in v1
+    ext             v1.8b, v6.8b, v7.8b, #2
+.else
+    ext             v0.8b, v6.8b, v7.8b, #1
+    ext             v1.8b, v6.8b, v7.8b, #2
+    ext             v2.8b, v6.8b, v7.8b, #3
+    ext             v3.8b, v6.8b, v7.8b, #4
+.endif
+.endm
+
+// 16-byte-wide chroma variant: load 32 bytes from [x11] into q16/q17 (x11 is
+// post-incremented by 32) and build v0..v3 as the windows starting at bytes 1..4 of q16:q17.
+.macro vextin8_chroma_64 v
+    ldp             q16, q17, [x11], #32
+.if \v == 0
+    // qpel_filter_chroma_0 only uses values in v1
+    ext             v1.16b, v16.16b, v17.16b, #2
+.else
+    ext             v0.16b, v16.16b, v17.16b, #1
+    ext             v1.16b, v16.16b, v17.16b, #2
+    ext             v2.16b, v16.16b, v17.16b, #3
+    ext             v3.16b, v16.16b, v17.16b, #4
+.endif
+.endm
+
+// Load the 8-byte input rows needed by luma filter \v into v0..v7, reading from [x6] and
+// stepping x6 by x1 between rows. Values of \v other than 0..3 emit no code.
+// NOTE(review): x1 looks like the source stride and x11 like 3 * stride -- confirm at the call sites.
+.macro qpel_load_32b v
+.if \v == 0
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
+    ld1             {v3.8b}, [x6], x1
+.elseif \v == 1 || \v == 2 || \v == 3
+.if \v != 3                           // not used in qpel_filter_3
+    ld1             {v0.8b}, [x6], x1
+.else
+    add             x6, x6, x1        // skip the row that would have gone to v0
+.endif
+    ld1             {v1.8b}, [x6], x1
+    ld1             {v2.8b}, [x6], x1
+    ld1             {v3.8b}, [x6], x1
+    ld1             {v4.8b}, [x6], x1
+    ld1             {v5.8b}, [x6], x1
+.if \v != 1                           // not used in qpel_filter_1
+    ld1             {v6.8b}, [x6], x1
+    ld1             {v7.8b}, [x6]     // last row: x6 is not advanced
+.else
+    ld1             {v6.8b}, [x6]     // last row: x6 is not advanced
+.endif
+.endif
+.endm
+
+// 16-byte-wide variant of qpel_load_32b: load the input rows needed by luma filter \v into
+// v0..v7, reading from [x6] and stepping x6 by x1 between rows. Values of \v other than
+// 0..3 emit no code.
+// NOTE(review): x1 looks like the source stride and x11 like 3 * stride -- confirm at the call sites.
+.macro qpel_load_64b v
+.if \v == 0
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
+    ld1             {v3.16b}, [x6], x1
+.elseif \v == 1 || \v == 2 || \v == 3
+.if \v != 3                           // not used in qpel_filter_3
+    ld1             {v0.16b}, [x6], x1
+.else
+    add             x6, x6, x1        // skip the row that would have gone to v0
+.endif
+    ld1             {v1.16b}, [x6], x1
+    ld1             {v2.16b}, [x6], x1
+    ld1             {v3.16b}, [x6], x1
+    ld1             {v4.16b}, [x6], x1
+    ld1             {v5.16b}, [x6], x1
+.if \v != 1                           // not used in qpel_filter_1
+    ld1             {v6.16b}, [x6], x1
+    ld1             {v7.16b}, [x6]    // last row: x6 is not advanced
+.else
+    ld1             {v6.16b}, [x6]    // last row: x6 is not advanced
+.endif
+.endif
+.endm
+
+// Load the 8-byte input rows needed by chroma filter \v: four rows into v0..v3 from [x6],
+// stepping x6 by x1 between rows, or only the second row into v1 when \v == 0.
+.macro qpel_chroma_load_32b v
+.if \v == 0
+    // qpel_filter_chroma_0 only uses values in v1
+    add             x6, x6, x1
+    ldr             d1, [x6]
+.else
+    ld1             {v0.8b}, [x6], x1
+    ld1             {v1.8b}, [x6], x1
+    ld1             {v2.8b}, [x6], x1
+    ld1             {v3.8b}, [x6]     // last row: x6 is not advanced
+.endif
+.endm
+
+// 16-byte-wide variant of qpel_chroma_load_32b: four rows into v0..v3 from [x6], stepping
+// x6 by x1 between rows, or only the second row into v1 when \v == 0.
+.macro qpel_chroma_load_64b v
+.if \v == 0
+    // qpel_filter_chroma_0 only uses values in v1
+    add             x6, x6, x1
+    ldr             q1, [x6]
+.else
+    ld1             {v0.16b}, [x6], x1
+    ld1             {v1.16b}, [x6], x1
+    ld1             {v2.16b}, [x6], x1
+    ld1             {v3.16b}, [x6]    // last row: x6 is not advanced
+.endif
+.endm
+
+//          a, b,   c,  d,  e,   f, g,  h
+// .hword   0, 0,   0, 64,  0,   0, 0,  0
+.macro qpel_start_0
+    movi            v24.16b, #64             // v24 = 64 in every byte lane: the single non-zero tap (d)
+.endm
+
+.macro qpel_filter_0_32b
+    umull           v17.8h, v3.8b, v24.8b    // v17.8h = 64*d: widening multiply of the 8 bytes in v3 by v24 (64 expected, see qpel_start_0)
+.endm
+

 
@@ -0,0 +1,1436 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// This file contains the macros written using NEON instruction set
+// that are also used by the SVE2 functions
+
+// Macros below follow these conventions:
+// - input data in registers: v0, v1, v2, v3, v4, v5, v6, v7
+// - constants in registers: v24, v25, v26, v27, v31
+// - temporary registers: v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30.
+// - _32b macros output a result in v17.4s
+// - _64b and _32b_1 macros output results in v17.4s, v18.4s
+
+#include "asm.S"
+
+.arch           armv8-a
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.macro vextin8 v
+    ldp             d6, d7, x11, #16
+.if \v == 0
+    // qpel_filter_0 only uses values in v3
+    ext             v3.8b, v6.8b, v7.8b, #4
+.else
+.if \v != 3
+    ext             v0.8b, v6.8b, v7.8b, #1
+.endif
+    ext             v1.8b, v6.8b, v7.8b, #2
+    ext             v2.8b, v6.8b, v7.8b, #3
+    ext             v3.8b, v6.8b, v7.8b, #4
+    ext             v4.8b, v6.8b, v7.8b, #5
+    ext             v5.8b, v6.8b, v7.8b, #6
+    ext             v6.8b, v6.8b, v7.8b, #7
+.endif
+.endm
+
+.macro vextin8_64 v
+    ldp             q6, q7, x11, #32
+.if \v == 0
+    // qpel_filter_0 only uses values in v3
+    ext             v3.16b, v6.16b, v7.16b, #4
+.else
+.if \v != 3
+    // qpel_filter_3 does not use values in v0
+    ext             v0.16b, v6.16b, v7.16b, #1
+.endif
+    ext             v1.16b, v6.16b, v7.16b, #2
+    ext             v2.16b, v6.16b, v7.16b, #3
+    ext             v3.16b, v6.16b, v7.16b, #4
+    ext             v4.16b, v6.16b, v7.16b, #5
+    ext             v5.16b, v6.16b, v7.16b, #6
+.if \v == 1
+    ext             v6.16b, v6.16b, v7.16b, #7
+    // qpel_filter_1 does not use v7
+.else
+    ext             v16.16b, v6.16b, v7.16b, #7
+    ext             v7.16b, v6.16b, v7.16b, #8
+    mov             v6.16b, v16.16b
+.endif
+.endif
+.endm
+
+.macro vextin8_chroma v
+    ldp             d6, d7, x11, #16
+.if \v == 0
+    // qpel_filter_chroma_0 only uses values in v1
+    ext             v1.8b, v6.8b, v7.8b, #2
+.else
+    ext             v0.8b, v6.8b, v7.8b, #1
+    ext             v1.8b, v6.8b, v7.8b, #2
+    ext             v2.8b, v6.8b, v7.8b, #3
+    ext             v3.8b, v6.8b, v7.8b, #4
+.endif
+.endm
+
+.macro vextin8_chroma_64 v
+    ldp             q16, q17, x11, #32
+.if \v == 0
+    // qpel_filter_chroma_0 only uses values in v1
+    ext             v1.16b, v16.16b, v17.16b, #2
+.else
+    ext             v0.16b, v16.16b, v17.16b, #1
+    ext             v1.16b, v16.16b, v17.16b, #2
+    ext             v2.16b, v16.16b, v17.16b, #3
+    ext             v3.16b, v16.16b, v17.16b, #4
+.endif
+.endm
+
+.macro qpel_load_32b v
+.if \v == 0
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
+    ld1             {v3.8b}, x6, x1
+.elseif \v == 1 || \v == 2 || \v == 3
+.if \v != 3                           // not used in qpel_filter_3
+    ld1             {v0.8b}, x6, x1
+.else
+    add             x6, x6, x1
+.endif
+    ld1             {v1.8b}, x6, x1
+    ld1             {v2.8b}, x6, x1
+    ld1             {v3.8b}, x6, x1
+    ld1             {v4.8b}, x6, x1
+    ld1             {v5.8b}, x6, x1
+.if \v != 1                           // not used in qpel_filter_1
+    ld1             {v6.8b}, x6, x1
+    ld1             {v7.8b}, x6
+.else
+    ld1             {v6.8b}, x6
+.endif
+.endif
+.endm
+
+.macro qpel_load_64b v
+.if \v == 0
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
+    ld1             {v3.16b}, x6, x1
+.elseif \v == 1 || \v == 2 || \v == 3
+.if \v != 3                           // not used in qpel_filter_3
+    ld1             {v0.16b}, x6, x1
+.else
+    add             x6, x6, x1
+.endif
+    ld1             {v1.16b}, x6, x1
+    ld1             {v2.16b}, x6, x1
+    ld1             {v3.16b}, x6, x1
+    ld1             {v4.16b}, x6, x1
+    ld1             {v5.16b}, x6, x1
+.if \v != 1                           // not used in qpel_filter_1
+    ld1             {v6.16b}, x6, x1
+    ld1             {v7.16b}, x6
+.else
+    ld1             {v6.16b}, x6
+.endif
+.endif
+.endm
+
+.macro qpel_chroma_load_32b v
+.if \v == 0
+    // qpel_filter_chroma_0 only uses values in v1
+    add             x6, x6, x1
+    ldr             d1, x6
+.else
+    ld1             {v0.8b}, x6, x1
+    ld1             {v1.8b}, x6, x1
+    ld1             {v2.8b}, x6, x1
+    ld1             {v3.8b}, x6
+.endif
+.endm
+
+.macro qpel_chroma_load_64b v
+.if \v == 0
+    // qpel_filter_chroma_0 only uses values in v1
+    add             x6, x6, x1
+    ldr             q1, x6
+.else
+    ld1             {v0.16b}, x6, x1
+    ld1             {v1.16b}, x6, x1
+    ld1             {v2.16b}, x6, x1
+    ld1             {v3.16b}, x6
+.endif
+.endm
+
+//          a, b,   c,  d,  e,   f, g,  h
+// .hword   0, 0,   0, 64,  0,   0, 0,  0
+.macro qpel_start_0
+    movi            v24.16b, #64
+.endm
+
+.macro qpel_filter_0_32b
+    umull           v17.8h, v3.8b, v24.8b    // 64*d
+.endm
+
​

x265_3.6.tar.gz/source/common/aarch64/ipfilter-sve2.S Added

@@ -0,0 +1,1282 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// Functions in this file:
+// ***** luma_vpp *****
+// ***** luma_vps *****
+// ***** luma_vsp *****
+// ***** luma_vss *****
+// ***** luma_hpp *****
+// ***** luma_hps *****
+// ***** chroma_vpp *****
+// ***** chroma_vps *****
+// ***** chroma_vsp *****
+// ***** chroma_vss *****
+// ***** chroma_hpp *****
+// ***** chroma_hps *****
+
+#include "asm-sve.S"
+#include "ipfilter-common.S"
+
+.arch armv8-a+sve2
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// SVE2 counterpart of qpel_load_32b: load the input rows needed by luma filter \v into
+// z0..z7 under predicate p0. Each ld1b reads bytes from [x6] and zero-extends them into
+// halfword lanes; x6 is stepped by x1 between rows. Values of \v other than 0..3 emit no code.
+// NOTE(review): x1 looks like the source stride and x11 like 3 * stride -- confirm at the call sites.
+.macro qpel_load_32b_sve2 v
+.if \v == 0
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
+    ld1b            {z3.h}, p0/z, [x6]
+    add             x6, x6, x1
+.elseif \v == 1 || \v == 2 || \v == 3
+.if \v != 3                           // not used in qpel_filter_3
+    ld1b            {z0.h}, p0/z, [x6]
+    add             x6, x6, x1
+.else
+    add             x6, x6, x1        // skip the row that would have gone to z0
+.endif
+    ld1b            {z1.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z2.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z3.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z4.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z5.h}, p0/z, [x6]
+    add             x6, x6, x1
+.if \v != 1                           // not used in qpel_filter_1
+    ld1b            {z6.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z7.h}, p0/z, [x6]    // last row: x6 is not advanced
+.else
+    ld1b            {z6.h}, p0/z, [x6]    // last row: x6 is not advanced
+.endif
+.endif
+.endm
+
+// Same load sequence as qpel_load_32b_sve2 but governed by predicate p2: load the input
+// rows needed by luma filter \v into z0..z7. Each ld1b reads bytes from [x6] and
+// zero-extends them into halfword lanes; x6 is stepped by x1 between rows. Values of \v
+// other than 0..3 emit no code.
+.macro qpel_load_64b_sve2_gt_16 v
+.if \v == 0
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
+    ld1b            {z3.h}, p2/z, [x6]
+    add             x6, x6, x1
+.elseif \v == 1 || \v == 2 || \v == 3
+.if \v != 3                           // not used in qpel_filter_3
+    ld1b            {z0.h}, p2/z, [x6]
+    add             x6, x6, x1
+.else
+    add             x6, x6, x1        // skip the row that would have gone to z0
+.endif
+    ld1b            {z1.h}, p2/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z2.h}, p2/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z3.h}, p2/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z4.h}, p2/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z5.h}, p2/z, [x6]
+    add             x6, x6, x1
+.if \v != 1                           // not used in qpel_filter_1
+    ld1b            {z6.h}, p2/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z7.h}, p2/z, [x6]    // last row: x6 is not advanced
+.else
+    ld1b            {z6.h}, p2/z, [x6]    // last row: x6 is not advanced
+.endif
+.endif
+.endm
+
+// SVE2 counterpart of qpel_chroma_load_32b: load four rows into z0..z3 under predicate p0
+// (bytes from [x6] zero-extended into halfword lanes, x6 stepped by x1 between rows), or
+// only the second row into z1 when \v == 0.
+.macro qpel_chroma_load_32b_sve2 v
+.if \v == 0
+    // qpel_filter_chroma_0 only uses values in v1
+    add             x6, x6, x1
+    ld1b            {z1.h}, p0/z, [x6]
+.else
+    ld1b            {z0.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z1.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z2.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z3.h}, p0/z, [x6]    // last row: x6 is not advanced
+.endif
+.endm
+
+.macro qpel_start_sve2_0
+    mov             z24.h, #64            // z24 = 64 in every halfword lane: the only constant the filter-0 macros multiply by
+.endm
+
+.macro qpel_filter_sve2_0_32b
+    mul             z17.h, z3.h, z24.h    // z17 = 64*d (d in z3; z24 expected to hold 64, see qpel_start_sve2_0)
+.endm
+
+.macro qpel_filter_sve2_0_64b
+    qpel_filter_sve2_0_32b                // first half: z17 = z3 * z24
+    mul             z18.h, z11.h, z24.h   // second half: z18 = z11 * z24
+.endm
+
+.macro qpel_start_sve2_1
+    mov             z24.h, #58            // multiplies d in the filter-1 macros
+    mov             z25.h, #10            // multiplies c (subtracted)
+    mov             z26.h, #17            // multiplies e
+    mov             z27.h, #5             // multiplies f (subtracted)
+.endm
+
+.macro qpel_filter_sve2_1_32b
+    mul             z19.h, z2.h, z25.h  // c*10 (inputs a..g are read from z0..z6; z24..z27 hold 58, 10, 17, 5)
+    mul             z17.h, z3.h, z24.h  // d*58
+    mul             z21.h, z4.h, z26.h  // e*17
+    mul             z23.h, z5.h, z27.h  // f*5
+    sub             z17.h, z17.h, z19.h // d*58 - c*10
+    lsl             z18.h, z1.h, #2      // b*4
+    add             z17.h, z17.h, z21.h // d*58 - c*10 + e*17
+    sub             z21.h, z6.h, z0.h   // g - a
+    add             z17.h, z17.h, z18.h // d*58 - c*10 + e*17 + b*4
+    sub             z21.h, z21.h, z23.h // g - a - f*5
+    add             z17.h, z17.h, z21.h // result in z17: d*58 - c*10 + e*17 + b*4 + g - a - f*5
+.endm
+
+.macro qpel_filter_sve2_1_64b
+    qpel_filter_sve2_1_32b               // first half: result in z17 (also clobbers z18..z23)
+    mul             z20.h, z10.h, z25.h  // c*10 (second half reads a, b from z29, z30 and c..g from z10..z14)
+    mul             z18.h, z11.h, z24.h  // d*58
+    mul             z21.h, z12.h, z26.h  // e*17
+    mul             z23.h, z13.h, z27.h  // f*5
+    sub             z18.h, z18.h, z20.h   // d*58 - c*10
+    lsl             z28.h, z30.h, #2       // b*4
+    add             z18.h, z18.h, z21.h   // d*58 - c*10 + e*17
+    sub             z21.h, z14.h, z29.h   // g - a
+    add             z18.h, z18.h, z28.h   // d*58 - c*10 + e*17 + b*4
+    sub             z21.h, z21.h, z23.h   // g - a - f*5
+    add             z18.h, z18.h, z21.h   // result in z18: d*58 - c*10 + e*17 + b*4 + g - a - f*5
+.endm
+
+.macro qpel_start_sve2_2
+    mov             z24.h, #11            // multiplies (c + f) in qpel_filter_sve2_2_32b (subtracted)
+    mov             z25.h, #40            // multiplies (d + e) in qpel_filter_sve2_2_32b
+.endm
+
+.macro qpel_filter_sve2_2_32b
+    add             z17.h, z3.h, z4.h     // d + e (inputs a..h are read from z0..z7; the taps are symmetric, so pairs are summed first)
+    add             z19.h, z2.h, z5.h     // c + f
+    add             z23.h, z1.h, z6.h     // b + g
+    add             z21.h, z0.h, z7.h     // a + h
+    mul             z17.h, z17.h, z25.h   // 40 * (d + e)
+    mul             z19.h, z19.h, z24.h   // 11 * (c + f)
+    lsl             z23.h, z23.h, #2       // (b + g) * 4
+    add             z19.h, z19.h, z21.h   // 11 * (c + f) + a + h
+    add             z17.h, z17.h, z23.h   // 40 * (d + e) + (b + g) * 4
+    sub             z17.h, z17.h, z19.h   // result in z17: 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
+.endm
+

 
@@ -0,0 +1,1282 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// Functions in this file:
+// ***** luma_vpp *****
+// ***** luma_vps *****
+// ***** luma_vsp *****
+// ***** luma_vss *****
+// ***** luma_hpp *****
+// ***** luma_hps *****
+// ***** chroma_vpp *****
+// ***** chroma_vps *****
+// ***** chroma_vsp *****
+// ***** chroma_vss *****
+// ***** chroma_hpp *****
+// ***** chroma_hps *****
+
+#include "asm-sve.S"
+#include "ipfilter-common.S"
+
+.arch armv8-a+sve2
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// Load the source rows needed by the luma qpel filter selected by macro
+// argument v (0..3). Each row is read as bytes and zero-extended into the
+// 16-bit lanes of one register, rows a..h going to z0..z7.
+//   x6  - address of the first row; advanced by this macro
+//   x1  - row stride added between loads
+//   x11 - offset added when v == 0 to step over the rows that filter skips
+//   p0  - governing predicate of every load
+// Rows the selected filter does not read are stepped over, not loaded:
+// v == 0 loads only z3, v == 1 omits z7, v == 3 omits z0.
+.macro qpel_load_32b_sve2 v
+.if \v == 0
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
+    ld1b            {z3.h}, p0/z, [x6]
+    add             x6, x6, x1
+.elseif \v == 1 || \v == 2 || \v == 3
+.if \v != 3                           // not used in qpel_filter_3
+    ld1b            {z0.h}, p0/z, [x6]
+    add             x6, x6, x1
+.else
+    add             x6, x6, x1
+.endif
+    ld1b            {z1.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z2.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z3.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z4.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z5.h}, p0/z, [x6]
+    add             x6, x6, x1
+.if \v != 1                           // not used in qpel_filter_1
+    ld1b            {z6.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z7.h}, p0/z, [x6]
+.else
+    ld1b            {z6.h}, p0/z, [x6]
+.endif
+.endif
+.endm
+
+// Same row-loading scheme as qpel_load_32b_sve2, but every load is governed
+// by predicate p2 instead of p0. Rows a..h are read as bytes, zero-extended
+// into the 16-bit lanes of z0..z7, with x6 as the running row address, x1 as
+// the row stride and x11 as the skip offset used when v == 0.
+// NOTE(review): the _gt_16 suffix suggests this variant is meant for vector
+// lengths above 16 bytes - confirm at the call sites.
+.macro qpel_load_64b_sve2_gt_16 v
+.if \v == 0
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
+    ld1b            {z3.h}, p2/z, [x6]
+    add             x6, x6, x1
+.elseif \v == 1 || \v == 2 || \v == 3
+.if \v != 3                           // not used in qpel_filter_3
+    ld1b            {z0.h}, p2/z, [x6]
+    add             x6, x6, x1
+.else
+    add             x6, x6, x1
+.endif
+    ld1b            {z1.h}, p2/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z2.h}, p2/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z3.h}, p2/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z4.h}, p2/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z5.h}, p2/z, [x6]
+    add             x6, x6, x1
+.if \v != 1                           // not used in qpel_filter_1
+    ld1b            {z6.h}, p2/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z7.h}, p2/z, [x6]
+.else
+    ld1b            {z6.h}, p2/z, [x6]
+.endif
+.endif
+.endm
+
+// Load the source rows for the chroma filter selected by macro argument v.
+// Rows are read as bytes and zero-extended into 16-bit lanes under
+// predicate p0; x6 is the running row address and x1 the row stride.
+// v == 0 steps over the first row and loads only z1; every other value
+// loads four consecutive rows into z0..z3.
+.macro qpel_chroma_load_32b_sve2 v
+.if \v == 0
+    // qpel_filter_chroma_0 only uses values in v1
+    add             x6, x6, x1
+    ld1b            {z1.h}, p0/z, [x6]
+.else
+    ld1b            {z0.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z1.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z2.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z3.h}, p0/z, [x6]
+.endif
+.endm
+
+// Broadcast the single constant used by the qpel_filter_sve2_0_* macros:
+// z24 = 64 in every 16-bit lane.
+.macro qpel_start_sve2_0
+    mov             z24.h, #64
+.endm
+
+// Filter 0 on one vector of 16-bit lanes: z17 = 64 * d, with row d in z3
+// and z24 holding 64 (see qpel_start_sve2_0). No other row is read.
+.macro qpel_filter_sve2_0_32b
+    mul             z17.h, z3.h, z24.h    // 64*d
+.endm
+
+// Filter 0 on two vectors of 16-bit lanes: z17 = 64 * z3 via the 32b
+// expansion, then z18 = 64 * z11 for the second vector.
+.macro qpel_filter_sve2_0_64b
+    qpel_filter_sve2_0_32b
+    mul             z18.h, z11.h, z24.h
+.endm
+
+// Broadcast the multiplier constants used by the qpel_filter_sve2_1_* macros
+// into every 16-bit lane: z24 = 58, z25 = 10, z26 = 17, z27 = 5.
+// The remaining taps (4 and 1) are applied there with a shift and plain
+// add/sub, so they need no constant register.
+.macro qpel_start_sve2_1
+    mov             z24.h, #58
+    mov             z25.h, #10
+    mov             z26.h, #17
+    mov             z27.h, #5
+.endm
+
+// Luma qpel filter on one vector of 16-bit lanes:
+//   z17 = -a + 4*b - 10*c + 58*d + 17*e - 5*f + g
+// with rows a..g in z0..z6 (z7 is not read). z24-z27 must hold
+// 58, 10, 17, 5 (see qpel_start_sve2_1).
+// Scratch: z18, z19, z21, z23.
+.macro qpel_filter_sve2_1_32b
+    mul             z19.h, z2.h, z25.h  // c*10
+    mul             z17.h, z3.h, z24.h  // d*58
+    mul             z21.h, z4.h, z26.h  // e*17
+    mul             z23.h, z5.h, z27.h  // f*5
+    sub             z17.h, z17.h, z19.h // d*58 - c*10
+    lsl             z18.h, z1.h, #2      // b*4
+    add             z17.h, z17.h, z21.h // d*58 - c*10 + e*17
+    sub             z21.h, z6.h, z0.h   // g - a
+    add             z17.h, z17.h, z18.h // d*58 - c*10 + e*17 + b*4
+    sub             z21.h, z21.h, z23.h // g - a - f*5
+    add             z17.h, z17.h, z21.h // d*58 - c*10 + e*17 + b*4 + g - a - f*5
+.endm
+
+// Luma qpel filter (taps -1, 4, -10, 58, 17, -5, 1) applied to two vectors
+// of 16-bit lanes. The first vector is handled by expanding
+// qpel_filter_sve2_1_32b (result in z17); the second vector takes its rows
+// a..g from z29, z30, z10, z11, z12, z13, z14 and leaves its result in z18.
+// z24-z27 must hold 58, 10, 17, 5 (see qpel_start_sve2_1).
+// Scratch here: z20, z21, z23, z28. z18 is also used as scratch by the 32b
+// expansion and is fully overwritten afterwards.
+.macro qpel_filter_sve2_1_64b
+    qpel_filter_sve2_1_32b
+    mul             z20.h, z10.h, z25.h  // c*10
+    mul             z18.h, z11.h, z24.h  // d*58
+    mul             z21.h, z12.h, z26.h  // e*17
+    mul             z23.h, z13.h, z27.h  // f*5
+    sub             z18.h, z18.h, z20.h   // d*58 - c*10
+    lsl             z28.h, z30.h, #2       // b*4
+    add             z18.h, z18.h, z21.h   // d*58 - c*10 + e*17
+    sub             z21.h, z14.h, z29.h   // g - a
+    add             z18.h, z18.h, z28.h   // d*58 - c*10 + e*17 + b*4
+    sub             z21.h, z21.h, z23.h   // g - a - f*5
+    add             z18.h, z18.h, z21.h   // d*58 - c*10 + e*17 + b*4 + g - a - f*5
+.endm
+
+// Broadcast the constants used by qpel_filter_sve2_2_32b:
+// z24 = 11 and z25 = 40 in every 16-bit lane.
+.macro qpel_start_sve2_2
+    mov             z24.h, #11
+    mov             z25.h, #40
+.endm
+
+// Symmetric luma qpel filter on one vector of 16-bit lanes:
+//   z17 = 40*(d + e) + 4*(b + g) - 11*(c + f) - (a + h)
+// with rows a..h in z0..z7. The taps are paired so only two multiplies are
+// needed. z24/z25 must hold 11/40 (see qpel_start_sve2_2).
+// Scratch: z19, z21, z23.
+.macro qpel_filter_sve2_2_32b
+    add             z17.h, z3.h, z4.h     // d + e
+    add             z19.h, z2.h, z5.h     // c + f
+    add             z23.h, z1.h, z6.h     // b + g
+    add             z21.h, z0.h, z7.h     // a + h
+    mul             z17.h, z17.h, z25.h   // 40 * (d + e)
+    mul             z19.h, z19.h, z24.h   // 11 * (c + f)
+    lsl             z23.h, z23.h, #2       // (b + g) * 4
+    add             z19.h, z19.h, z21.h   // 11 * (c + f) + a + h
+    add             z17.h, z17.h, z23.h   // 40 * (d + e) + (b + g) * 4
+    sub             z17.h, z17.h, z19.h   // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
+.endm
+
​

x265_3.6.tar.gz/source/common/aarch64/ipfilter.S Added

@@ -0,0 +1,1054 @@
+/*****************************************************************************
+ * Copyright (C) 2021 MulticoreWare, Inc
+ *
+ * Authors: Sebastian Pop <spop@amazon.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// Functions in this file:
+// ***** luma_vpp *****
+// ***** luma_vps *****
+// ***** luma_vsp *****
+// ***** luma_vss *****
+// ***** luma_hpp *****
+// ***** luma_hps *****
+// ***** chroma_vpp *****
+// ***** chroma_vps *****
+// ***** chroma_vsp *****
+// ***** chroma_vss *****
+// ***** chroma_hpp *****
+// ***** chroma_hps *****
+
+#include "asm.S"
+#include "ipfilter-common.S"
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// ***** luma_vpp *****
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
+// Vertical 8-tap luma interpolation, pixel in / pixel out, for 4-wide blocks
+// of height h. Arguments follow the C prototype above: x0 = src,
+// x1 = srcStride, x2 = dst, x3 = dstStride, x4 = coeffIdx.
+// Two output rows are produced per loop iteration.
+// Register layout: each of v0-v4 holds two consecutive 4-pixel source rows
+// widened to 16 bits (earlier row in the low half, later row in the high
+// half). v24/v26/v28/v30 hold the coefficient pairs {c0,c1}, {c2,c3},
+// {c4,c5}, {c6,c7}, each coefficient replicated across one 64-bit half.
+.macro LUMA_VPP_4xN h
+function x265_interp_8tap_vert_pp_4x\h\()_neon
+    movrel          x10, g_luma_s16
+    sub             x0, x0, x1
+    sub             x0, x0, x1, lsl #1         // src -= 3 * srcStride
+    lsl             x4, x4, #4                 // coeffIdx * 16 bytes (8 halfword taps per filter)
+    ldr             q0, [x10, x4]              // q0 = luma interpolate coeff
+    dup             v24.8h, v0.h[0]
+    dup             v25.8h, v0.h[1]
+    trn1            v24.2d, v24.2d, v25.2d     // v24 = {c0 x4, c1 x4}
+    dup             v26.8h, v0.h[2]
+    dup             v27.8h, v0.h[3]
+    trn1            v26.2d, v26.2d, v27.2d     // v26 = {c2 x4, c3 x4}
+    dup             v28.8h, v0.h[4]
+    dup             v29.8h, v0.h[5]
+    trn1            v28.2d, v28.2d, v29.2d     // v28 = {c4 x4, c5 x4}
+    dup             v30.8h, v0.h[6]
+    dup             v31.8h, v0.h[7]
+    trn1            v30.2d, v30.2d, v31.2d     // v30 = {c6 x4, c7 x4}
+
+    // load the first 8 lines, two per register, advancing src by one stride per load
+    ld1             {v0.s}[0], [x0], x1
+    ld1             {v0.s}[1], [x0], x1
+    ushll           v0.8h, v0.8b, #0
+    ld1             {v1.s}[0], [x0], x1
+    ld1             {v1.s}[1], [x0], x1
+    ushll           v1.8h, v1.8b, #0
+    ld1             {v2.s}[0], [x0], x1
+    ld1             {v2.s}[1], [x0], x1
+    ushll           v2.8h, v2.8b, #0
+    ld1             {v3.s}[0], [x0], x1
+    ld1             {v3.s}[1], [x0], x1
+    ushll           v3.8h, v3.8b, #0
+
+    mov             x9, #\h                    // rows left to produce
+.loop_4x\h:
+    // two more source rows for this pair of output rows
+    ld1             {v4.s}[0], [x0], x1
+    ld1             {v4.s}[1], [x0], x1
+    ushll           v4.8h, v4.8b, #0
+
+    // v16 collects the tap products of the first output row; v17 does the
+    // same for the second output row, using ext to shift the row pairs by
+    // one row. The mov after each step slides the window down two rows.
+
+    // row0-1
+    mul             v16.8h, v0.8h, v24.8h
+    ext             v21.16b, v0.16b, v1.16b, #8
+    mul             v17.8h, v21.8h, v24.8h
+    mov             v0.16b, v1.16b
+
+    // row2-3
+    mla             v16.8h, v1.8h, v26.8h
+    ext             v21.16b, v1.16b, v2.16b, #8
+    mla             v17.8h, v21.8h, v26.8h
+    mov             v1.16b, v2.16b
+
+    // row4-5
+    mla             v16.8h, v2.8h, v28.8h
+    ext             v21.16b, v2.16b, v3.16b, #8
+    mla             v17.8h, v21.8h, v28.8h
+    mov             v2.16b, v3.16b
+
+    // row6-7
+    mla             v16.8h, v3.8h, v30.8h
+    ext             v21.16b, v3.16b, v4.16b, #8
+    mla             v17.8h, v21.8h, v30.8h
+    mov             v3.16b, v4.16b
+
+    // sum row0-7: add the even-tap half and the odd-tap half of each accumulator
+    trn1            v20.2d, v16.2d, v17.2d
+    trn2            v21.2d, v16.2d, v17.2d
+    add             v16.8h, v20.8h, v21.8h
+
+    sqrshrun        v16.8b,  v16.8h,  #6       // round, shift by 6, saturate to 8 bits
+    st1             {v16.s}[0], [x2], x3
+    st1             {v16.s}[1], [x2], x3
+
+    sub             x9, x9, #2
+    cbnz            x9, .loop_4x\h
+    ret
+endfunc
+.endm
+
+LUMA_VPP_4xN 4
+LUMA_VPP_4xN 8
+LUMA_VPP_4xN 16
+
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
+// Emit x265_interp_8tap_vert_pp_<w>x<h>_neon: compare coeffIdx (x4) against
+// 0..3 and branch to the matching FILTER_LUMA_VPP expansion, one per
+// coefficient set. A value outside 0..3 takes no branch and falls into the
+// expansion at label 0.
+// NOTE(review): there is no ret or branch between the four expansions here,
+// so FILTER_LUMA_VPP (not defined in this file; presumably in
+// ipfilter-common.S) is expected to end with its own ret - confirm.
+.macro LUMA_VPP w, h
+function x265_interp_8tap_vert_pp_\w\()x\h\()_neon
+    cmp             x4, #0
+    b.eq            0f
+    cmp             x4, #1
+    b.eq            1f
+    cmp             x4, #2
+    b.eq            2f
+    cmp             x4, #3
+    b.eq            3f
+0:
+    FILTER_LUMA_VPP \w, \h, 0
+1:
+    FILTER_LUMA_VPP \w, \h, 1
+2:
+    FILTER_LUMA_VPP \w, \h, 2
+3:
+    FILTER_LUMA_VPP \w, \h, 3
+endfunc
+.endm
+
+// One function per block size wider than 4 pixels.
+LUMA_VPP 8, 4
+LUMA_VPP 8, 8
+LUMA_VPP 8, 16
+LUMA_VPP 8, 32
+LUMA_VPP 12, 16
+LUMA_VPP 16, 4
+LUMA_VPP 16, 8
+LUMA_VPP 16, 16
+LUMA_VPP 16, 32
+LUMA_VPP 16, 64
+LUMA_VPP 16, 12
+LUMA_VPP 24, 32
+LUMA_VPP 32, 8
+LUMA_VPP 32, 16
+LUMA_VPP 32, 32
+LUMA_VPP 32, 64
+LUMA_VPP 32, 24
+LUMA_VPP 48, 64
+LUMA_VPP 64, 16
+LUMA_VPP 64, 32
+LUMA_VPP 64, 64
+LUMA_VPP 64, 48
+
+// ***** luma_vps *****
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
+.macro LUMA_VPS_4xN h
+function x265_interp_8tap_vert_ps_4x\h\()_neon
+    lsl             x3, x3, #1
+    lsl             x5, x4, #6
+    lsl             x4, x1, #2
+    sub             x4, x4, x1
+    sub             x0, x0, x4
+
+    mov             w6, #8192
+    dup             v28.4s, w6
+    mov             x4, #\h
+    movrel          x12, g_lumaFilter
+    add             x12, x12, x5
+    ld1r            {v16.2d}, x12, #8
+    ld1r            {v17.2d}, x12, #8
+    ld1r            {v18.2d}, x12, #8
+    ld1r            {v19.2d}, x12, #8

 
@@ -0,0 +1,1054 @@
+/*****************************************************************************
+ * Copyright (C) 2021 MulticoreWare, Inc
+ *
+ * Authors: Sebastian Pop <spop@amazon.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// Functions in this file:
+// ***** luma_vpp *****
+// ***** luma_vps *****
+// ***** luma_vsp *****
+// ***** luma_vss *****
+// ***** luma_hpp *****
+// ***** luma_hps *****
+// ***** chroma_vpp *****
+// ***** chroma_vps *****
+// ***** chroma_vsp *****
+// ***** chroma_vss *****
+// ***** chroma_hpp *****
+// ***** chroma_hps *****
+
+#include "asm.S"
+#include "ipfilter-common.S"
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// ***** luma_vpp *****
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
+// Vertical 8-tap luma interpolation, pixel in / pixel out, for 4-wide blocks
+// of height h. Arguments follow the C prototype above: x0 = src,
+// x1 = srcStride, x2 = dst, x3 = dstStride, x4 = coeffIdx.
+// Two output rows are produced per loop iteration.
+// Register layout: each of v0-v4 holds two consecutive 4-pixel source rows
+// widened to 16 bits (earlier row in the low half, later row in the high
+// half). v24/v26/v28/v30 hold the coefficient pairs {c0,c1}, {c2,c3},
+// {c4,c5}, {c6,c7}, each coefficient replicated across one 64-bit half.
+.macro LUMA_VPP_4xN h
+function x265_interp_8tap_vert_pp_4x\h\()_neon
+    movrel          x10, g_luma_s16
+    sub             x0, x0, x1
+    sub             x0, x0, x1, lsl #1         // src -= 3 * srcStride
+    lsl             x4, x4, #4                 // coeffIdx * 16 bytes (8 halfword taps per filter)
+    ldr             q0, [x10, x4]              // q0 = luma interpolate coeff
+    dup             v24.8h, v0.h[0]
+    dup             v25.8h, v0.h[1]
+    trn1            v24.2d, v24.2d, v25.2d     // v24 = {c0 x4, c1 x4}
+    dup             v26.8h, v0.h[2]
+    dup             v27.8h, v0.h[3]
+    trn1            v26.2d, v26.2d, v27.2d     // v26 = {c2 x4, c3 x4}
+    dup             v28.8h, v0.h[4]
+    dup             v29.8h, v0.h[5]
+    trn1            v28.2d, v28.2d, v29.2d     // v28 = {c4 x4, c5 x4}
+    dup             v30.8h, v0.h[6]
+    dup             v31.8h, v0.h[7]
+    trn1            v30.2d, v30.2d, v31.2d     // v30 = {c6 x4, c7 x4}
+
+    // load the first 8 lines, two per register, advancing src by one stride per load
+    ld1             {v0.s}[0], [x0], x1
+    ld1             {v0.s}[1], [x0], x1
+    ushll           v0.8h, v0.8b, #0
+    ld1             {v1.s}[0], [x0], x1
+    ld1             {v1.s}[1], [x0], x1
+    ushll           v1.8h, v1.8b, #0
+    ld1             {v2.s}[0], [x0], x1
+    ld1             {v2.s}[1], [x0], x1
+    ushll           v2.8h, v2.8b, #0
+    ld1             {v3.s}[0], [x0], x1
+    ld1             {v3.s}[1], [x0], x1
+    ushll           v3.8h, v3.8b, #0
+
+    mov             x9, #\h                    // rows left to produce
+.loop_4x\h:
+    // two more source rows for this pair of output rows
+    ld1             {v4.s}[0], [x0], x1
+    ld1             {v4.s}[1], [x0], x1
+    ushll           v4.8h, v4.8b, #0
+
+    // v16 collects the tap products of the first output row; v17 does the
+    // same for the second output row, using ext to shift the row pairs by
+    // one row. The mov after each step slides the window down two rows.
+
+    // row0-1
+    mul             v16.8h, v0.8h, v24.8h
+    ext             v21.16b, v0.16b, v1.16b, #8
+    mul             v17.8h, v21.8h, v24.8h
+    mov             v0.16b, v1.16b
+
+    // row2-3
+    mla             v16.8h, v1.8h, v26.8h
+    ext             v21.16b, v1.16b, v2.16b, #8
+    mla             v17.8h, v21.8h, v26.8h
+    mov             v1.16b, v2.16b
+
+    // row4-5
+    mla             v16.8h, v2.8h, v28.8h
+    ext             v21.16b, v2.16b, v3.16b, #8
+    mla             v17.8h, v21.8h, v28.8h
+    mov             v2.16b, v3.16b
+
+    // row6-7
+    mla             v16.8h, v3.8h, v30.8h
+    ext             v21.16b, v3.16b, v4.16b, #8
+    mla             v17.8h, v21.8h, v30.8h
+    mov             v3.16b, v4.16b
+
+    // sum row0-7: add the even-tap half and the odd-tap half of each accumulator
+    trn1            v20.2d, v16.2d, v17.2d
+    trn2            v21.2d, v16.2d, v17.2d
+    add             v16.8h, v20.8h, v21.8h
+
+    sqrshrun        v16.8b,  v16.8h,  #6       // round, shift by 6, saturate to 8 bits
+    st1             {v16.s}[0], [x2], x3
+    st1             {v16.s}[1], [x2], x3
+
+    sub             x9, x9, #2
+    cbnz            x9, .loop_4x\h
+    ret
+endfunc
+.endm
+
+LUMA_VPP_4xN 4
+LUMA_VPP_4xN 8
+LUMA_VPP_4xN 16
+
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
+// Emit x265_interp_8tap_vert_pp_<w>x<h>_neon: compare coeffIdx (x4) against
+// 0..3 and branch to the matching FILTER_LUMA_VPP expansion, one per
+// coefficient set. A value outside 0..3 takes no branch and falls into the
+// expansion at label 0.
+// NOTE(review): there is no ret or branch between the four expansions here,
+// so FILTER_LUMA_VPP (not defined in this file; presumably in
+// ipfilter-common.S) is expected to end with its own ret - confirm.
+.macro LUMA_VPP w, h
+function x265_interp_8tap_vert_pp_\w\()x\h\()_neon
+    cmp             x4, #0
+    b.eq            0f
+    cmp             x4, #1
+    b.eq            1f
+    cmp             x4, #2
+    b.eq            2f
+    cmp             x4, #3
+    b.eq            3f
+0:
+    FILTER_LUMA_VPP \w, \h, 0
+1:
+    FILTER_LUMA_VPP \w, \h, 1
+2:
+    FILTER_LUMA_VPP \w, \h, 2
+3:
+    FILTER_LUMA_VPP \w, \h, 3
+endfunc
+.endm
+
+// One function per block size wider than 4 pixels.
+LUMA_VPP 8, 4
+LUMA_VPP 8, 8
+LUMA_VPP 8, 16
+LUMA_VPP 8, 32
+LUMA_VPP 12, 16
+LUMA_VPP 16, 4
+LUMA_VPP 16, 8
+LUMA_VPP 16, 16
+LUMA_VPP 16, 32
+LUMA_VPP 16, 64
+LUMA_VPP 16, 12
+LUMA_VPP 24, 32
+LUMA_VPP 32, 8
+LUMA_VPP 32, 16
+LUMA_VPP 32, 32
+LUMA_VPP 32, 64
+LUMA_VPP 32, 24
+LUMA_VPP 48, 64
+LUMA_VPP 64, 16
+LUMA_VPP 64, 32
+LUMA_VPP 64, 64
+LUMA_VPP 64, 48
+
+// ***** luma_vps *****
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
+.macro LUMA_VPS_4xN h
+function x265_interp_8tap_vert_ps_4x\h\()_neon
+    lsl             x3, x3, #1
+    lsl             x5, x4, #6
+    lsl             x4, x1, #2
+    sub             x4, x4, x1
+    sub             x0, x0, x4
+
+    mov             w6, #8192
+    dup             v28.4s, w6
+    mov             x4, #\h
+    movrel          x12, g_lumaFilter
+    add             x12, x12, x5
+    ld1r            {v16.2d}, x12, #8
+    ld1r            {v17.2d}, x12, #8
+    ld1r            {v18.2d}, x12, #8
+    ld1r            {v19.2d}, x12, #8
​

x265_3.6.tar.gz/source/common/aarch64/loopfilter-prim.cpp Added

@@ -0,0 +1,291 @@
+#include "loopfilter-prim.h"
+
+#define PIXEL_MIN 0
+
+
+
+#if !(HIGH_BIT_DEPTH) && defined(HAVE_NEON)
+#include<arm_neon.h>
+
+namespace
+{
+
+
+/* get the sign of input variable (TODO: this is a dup, make common) */
+/* Branch-free: returns -1, 0 or 1 for negative, zero or positive x.
+ * x >> 31 supplies the -1 for negative inputs (this assumes a 32-bit int and
+ * an arithmetic right shift); the top bit of -x, read as unsigned, supplies
+ * the +1 for positive inputs. */
+static inline int8_t signOf(int x)
+{
+    return (x >> 31) | ((int)((((uint32_t) - x)) >> 31));
+}
+
+/* Per-lane sign of (in0 - in1) for eight unsigned bytes: each result lane is
+ * -1, 0 or 1. */
+static inline int8x8_t sign_diff_neon(const uint8x8_t in0, const uint8x8_t in1)
+{
+    // Widening subtract so the difference fits in 16-bit lanes.
+    const int16x8_t diff = vsubl_u8(in0, in1);
+    // Clamp to [-1, 1]: cap at +1 first, then raise to at least -1.
+    const int16x8_t cappedAtOne = vminq_s16(diff, vdupq_n_s16(1));
+    const int16x8_t sign = vmaxq_s16(cappedAtOne, vdupq_n_s16(-1));
+    return vmovn_s16(sign);
+}
+
+/* Write the sign (-1, 0 or 1) of src1[x] - src2[x] to dst[x] for every x in
+ * [0, endX). Full groups of eight go through sign_diff_neon; the remainder
+ * is handled one element at a time with signOf. */
+static void calSign_neon(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
+{
+    int x = 0;
+    for (; (x + 8) <= endX; x += 8)
+    {
+        *(int8x8_t *)&dst[x]  = sign_diff_neon(*(uint8x8_t *)&src1[x], *(uint8x8_t *)&src2[x]);
+    }
+
+    // scalar tail for the last (endX % 8) elements
+    for (; x < endX; x++)
+    {
+        dst[x] = signOf(src1[x] - src2[x]);
+    }
+}
+
+/* SAO edge offset against the left and right neighbours, applied in place
+ * to two rows of rec.
+ *   rec      - first pixel of the first row; advanced by stride per row
+ *   offsetEo - offset table indexed by edge type; the edge type is
+ *              sign(left) + sign(right) + 2, so entries 0..4 are read
+ *   width    - number of pixels processed per row
+ *   signLeft - sign carried in from the left of each row (signLeft[0] and
+ *              signLeft[1] are read)
+ *   stride   - distance between the two rows
+ * rec[x + 1] is read for every processed x, including x == width - 1. */
+static void processSaoCUE0_neon(pixel *rec, int8_t *offsetEo, int width, int8_t *signLeft, intptr_t stride)
+{
+    int y;
+    int8_t signRight, signLeft0;
+    int8_t edgeType;
+
+    for (y = 0; y < 2; y++)
+    {
+        signLeft0 = signLeft[y];
+        int x = 0;
+
+        if (width >= 8)
+        {
+            int8x8_t vsignRight;
+            // val[0] holds the negated right-signs of the current group;
+            // val[1][0] holds the left sign carried into the group.
+            int8x8x2_t shifter;
+            shifter.val[1][0] = signLeft0;
+            // Table index 8 selects val[1][0]; indices 0..6 select val[0][0..6].
+            // The lookup therefore yields each pixel's left sign.
+            static const int8x8_t index = {8, 0, 1, 2, 3, 4, 5, 6};
+            int8x8_t tbl = *(int8x8_t *)offsetEo;
+            for (; (x + 8) <= width; x += 8)
+            {
+                uint8x8_t in = *(uint8x8_t *)&rec[x];
+                vsignRight = sign_diff_neon(in, *(uint8x8_t *)&rec[x + 1]);
+                shifter.val[0] = vneg_s8(vsignRight);
+                int8x8_t tmp = shifter.val[0];
+                int8x8_t edge = vtbl2_s8(shifter, index);
+                int8x8_t vedgeType = vadd_s8(vadd_s8(vsignRight, edge), vdup_n_s8(2));
+                shifter.val[1][0] = tmp[7];   // carry for the next group
+                int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
+                t1 = vaddw_u8(t1, in);
+                // clamp to [0, 255] before narrowing back to bytes
+                t1 = vmaxq_s16(t1, vdupq_n_s16(0));
+                t1 = vminq_s16(t1, vdupq_n_s16(255));
+                *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
+            }
+            signLeft0 = shifter.val[1][0];
+        }
+        // scalar tail, continuing with the carried left sign
+        for (; x < width; x++)
+        {
+            signRight = ((rec[x] - rec[x + 1]) < 0) ? -1 : ((rec[x] - rec[x + 1]) > 0) ? 1 : 0;
+            edgeType = signRight + signLeft0 + 2;
+            signLeft0 = -signRight;
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
+        }
+        rec += stride;
+    }
+}
+
+/* SAO edge offset against the rows above and below, applied in place to one
+ * row of rec.
+ *   rec      - first pixel of the row; rec[x + stride] is the pixel below
+ *   upBuff1  - on entry the sign against the row above for each x; on exit
+ *              the negated sign against the row below
+ *   offsetEo - offset table indexed by edge type
+ *              (signDown + upBuff1[x] + 2, so entries 0..4 are read)
+ *   stride   - distance to the row below
+ *   width    - number of pixels processed */
+static void processSaoCUE1_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int width)
+{
+    int x = 0;
+    int8_t signDown;
+    int edgeType;
+
+    if (width >= 8)
+    {
+        int8x8_t tbl = *(int8x8_t *)offsetEo;
+        for (; (x + 8) <= width; x += 8)
+        {
+            uint8x8_t in0 = *(uint8x8_t *)&rec[x];
+            uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
+            int8x8_t vsignDown = sign_diff_neon(in0, in1);
+            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
+            *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
+            int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
+            t1 = vaddw_u8(t1, in0);
+            // saturating narrow clamps the sum to [0, 255]
+            *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
+        }
+    }
+    // scalar tail
+    for (; x < width; x++)
+    {
+        signDown = signOf(rec[x] - rec[x + stride]);
+        edgeType = signDown + upBuff1[x] + 2;
+        upBuff1[x] = -signDown;
+        rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
+    }
+}
+
+/* Two-row variant of the vertical SAO edge offset: runs the same per-row
+ * step twice, advancing rec by stride in between. upBuff1 is updated after
+ * the first row and consumed by the second.
+ *   rec      - first pixel of the first row; rec[x + stride] is the pixel below
+ *   upBuff1  - sign against the row above for each x; rewritten per row with
+ *              the negated sign against the row below
+ *   offsetEo - offset table indexed by edge type
+ *              (signDown + upBuff1[x] + 2, so entries 0..4 are read)
+ *   stride   - distance between rows
+ *   width    - number of pixels processed per row */
+static void processSaoCUE1_2Rows_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int width)
+{
+    int y;
+    int8_t signDown;
+    int edgeType;
+
+    for (y = 0; y < 2; y++)
+    {
+        int x = 0;
+        if (width >= 8)
+        {
+            int8x8_t tbl = *(int8x8_t *)offsetEo;
+            for (; (x + 8) <= width; x += 8)
+            {
+                uint8x8_t in0 = *(uint8x8_t *)&rec[x];
+                uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
+                int8x8_t vsignDown = sign_diff_neon(in0, in1);
+                int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
+                *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
+                int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
+                t1 = vaddw_u8(t1, in0);
+                // clamp to [0, 255] before narrowing back to bytes
+                t1 = vmaxq_s16(t1, vdupq_n_s16(0));
+                t1 = vminq_s16(t1, vdupq_n_s16(255));
+                *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
+            }
+        }
+        // scalar tail
+        for (; x < width; x++)
+        {
+            signDown = signOf(rec[x] - rec[x + stride]);
+            edgeType = signDown + upBuff1[x] + 2;
+            upBuff1[x] = -signDown;
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
+        }
+        rec += stride;
+    }
+}
+
+/* SAO edge offset along the diagonal, applied in place to one row of rec.
+ * Each pixel is compared with rec[x + stride + 1].
+ *   rec      - first pixel of the row
+ *   bufft    - receives the negated down-right sign at bufft[x + 1]
+ *   buff1    - sign against the other diagonal neighbour for each x (read only)
+ *   offsetEo - offset table indexed by edge type
+ *              (signDown + buff1[x] + 2, so entries 0..4 are read)
+ *   width    - number of pixels processed
+ *   stride   - distance to the row below
+ * When buff1 and bufft are less than 16 bytes apart, only the scalar loop is
+ * used. NOTE(review): presumably because an 8-wide store to bufft could
+ * overwrite buff1 entries that have not been read yet - confirm. */
+static void processSaoCUE2_neon(pixel *rec, int8_t *bufft, int8_t *buff1, int8_t *offsetEo, int width, intptr_t stride)
+{
+    int x;
+
+    if (abs(buff1 - bufft) < 16)
+    {
+        for (x = 0; x < width; x++)
+        {
+            int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
+            int edgeType = signDown + buff1[x] + 2;
+            bufft[x + 1] = -signDown;
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
+        }
+    }
+    else
+    {
+        int8x8_t tbl = *(int8x8_t *)offsetEo;
+        x = 0;
+        for (; (x + 8) <= width; x += 8)
+        {
+            uint8x8_t in0 = *(uint8x8_t *)&rec[x];
+            uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride + 1];
+            int8x8_t vsignDown = sign_diff_neon(in0, in1);
+            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&buff1[x]), vdup_n_s8(2));
+            *(int8x8_t *)&bufft[x + 1] = vneg_s8(vsignDown);
+            int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
+            t1 = vaddw_u8(t1, in0);
+            // clamp to [0, 255] before narrowing back to bytes
+            t1 = vmaxq_s16(t1, vdupq_n_s16(0));
+            t1 = vminq_s16(t1, vdupq_n_s16(255));
+            *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
+        }
+        // scalar tail
+        for (; x < width; x++)
+        {
+            int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
+            int edgeType = signDown + buff1[x] + 2;
+            bufft[x + 1] = -signDown;
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
+        }
+    }
+}
+
+
+static void processSaoCUE3_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX)

 
@@ -0,0 +1,291 @@
+#include "loopfilter-prim.h"
+
+#define PIXEL_MIN 0
+
+
+
+#if !(HIGH_BIT_DEPTH) && defined(HAVE_NEON)
+#include<arm_neon.h>
+
+namespace
+{
+
+
+/* get the sign of input variable (TODO: this is a dup, make common) */
+/* Branch-free: returns -1, 0 or 1 for negative, zero or positive x.
+ * x >> 31 supplies the -1 for negative inputs (this assumes a 32-bit int and
+ * an arithmetic right shift); the top bit of -x, read as unsigned, supplies
+ * the +1 for positive inputs. */
+static inline int8_t signOf(int x)
+{
+    return (x >> 31) | ((int)((((uint32_t) - x)) >> 31));
+}
+
+/* Per-lane sign of (in0 - in1) for eight unsigned bytes: each result lane is
+ * -1, 0 or 1. */
+static inline int8x8_t sign_diff_neon(const uint8x8_t in0, const uint8x8_t in1)
+{
+    // Widening subtract so the difference fits in 16-bit lanes.
+    const int16x8_t diff = vsubl_u8(in0, in1);
+    // Clamp to [-1, 1]: cap at +1 first, then raise to at least -1.
+    const int16x8_t cappedAtOne = vminq_s16(diff, vdupq_n_s16(1));
+    const int16x8_t sign = vmaxq_s16(cappedAtOne, vdupq_n_s16(-1));
+    return vmovn_s16(sign);
+}
+
+/* Write the sign (-1, 0 or 1) of src1[x] - src2[x] to dst[x] for every x in
+ * [0, endX). Full groups of eight go through sign_diff_neon; the remainder
+ * is handled one element at a time with signOf. */
+static void calSign_neon(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
+{
+    int x = 0;
+    for (; (x + 8) <= endX; x += 8)
+    {
+        *(int8x8_t *)&dst[x]  = sign_diff_neon(*(uint8x8_t *)&src1[x], *(uint8x8_t *)&src2[x]);
+    }
+
+    // scalar tail for the last (endX % 8) elements
+    for (; x < endX; x++)
+    {
+        dst[x] = signOf(src1[x] - src2[x]);
+    }
+}
+
+/* SAO edge offset against the left and right neighbours, applied in place
+ * to two rows of rec.
+ *   rec      - first pixel of the first row; advanced by stride per row
+ *   offsetEo - offset table indexed by edge type; the edge type is
+ *              sign(left) + sign(right) + 2, so entries 0..4 are read
+ *   width    - number of pixels processed per row
+ *   signLeft - sign carried in from the left of each row (signLeft[0] and
+ *              signLeft[1] are read)
+ *   stride   - distance between the two rows
+ * rec[x + 1] is read for every processed x, including x == width - 1. */
+static void processSaoCUE0_neon(pixel *rec, int8_t *offsetEo, int width, int8_t *signLeft, intptr_t stride)
+{
+    int y;
+    int8_t signRight, signLeft0;
+    int8_t edgeType;
+
+    for (y = 0; y < 2; y++)
+    {
+        signLeft0 = signLeft[y];
+        int x = 0;
+
+        if (width >= 8)
+        {
+            int8x8_t vsignRight;
+            // val[0] holds the negated right-signs of the current group;
+            // val[1][0] holds the left sign carried into the group.
+            int8x8x2_t shifter;
+            shifter.val[1][0] = signLeft0;
+            // Table index 8 selects val[1][0]; indices 0..6 select val[0][0..6].
+            // The lookup therefore yields each pixel's left sign.
+            static const int8x8_t index = {8, 0, 1, 2, 3, 4, 5, 6};
+            int8x8_t tbl = *(int8x8_t *)offsetEo;
+            for (; (x + 8) <= width; x += 8)
+            {
+                uint8x8_t in = *(uint8x8_t *)&rec[x];
+                vsignRight = sign_diff_neon(in, *(uint8x8_t *)&rec[x + 1]);
+                shifter.val[0] = vneg_s8(vsignRight);
+                int8x8_t tmp = shifter.val[0];
+                int8x8_t edge = vtbl2_s8(shifter, index);
+                int8x8_t vedgeType = vadd_s8(vadd_s8(vsignRight, edge), vdup_n_s8(2));
+                shifter.val[1][0] = tmp[7];   // carry for the next group
+                int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
+                t1 = vaddw_u8(t1, in);
+                // clamp to [0, 255] before narrowing back to bytes
+                t1 = vmaxq_s16(t1, vdupq_n_s16(0));
+                t1 = vminq_s16(t1, vdupq_n_s16(255));
+                *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
+            }
+            signLeft0 = shifter.val[1][0];
+        }
+        // scalar tail, continuing with the carried left sign
+        for (; x < width; x++)
+        {
+            signRight = ((rec[x] - rec[x + 1]) < 0) ? -1 : ((rec[x] - rec[x + 1]) > 0) ? 1 : 0;
+            edgeType = signRight + signLeft0 + 2;
+            signLeft0 = -signRight;
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
+        }
+        rec += stride;
+    }
+}
+
+/* SAO edge offset against the rows above and below, applied in place to one
+ * row of rec.
+ *   rec      - first pixel of the row; rec[x + stride] is the pixel below
+ *   upBuff1  - on entry the sign against the row above for each x; on exit
+ *              the negated sign against the row below
+ *   offsetEo - offset table indexed by edge type
+ *              (signDown + upBuff1[x] + 2, so entries 0..4 are read)
+ *   stride   - distance to the row below
+ *   width    - number of pixels processed */
+static void processSaoCUE1_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int width)
+{
+    int x = 0;
+    int8_t signDown;
+    int edgeType;
+
+    if (width >= 8)
+    {
+        int8x8_t tbl = *(int8x8_t *)offsetEo;
+        for (; (x + 8) <= width; x += 8)
+        {
+            uint8x8_t in0 = *(uint8x8_t *)&rec[x];
+            uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
+            int8x8_t vsignDown = sign_diff_neon(in0, in1);
+            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
+            *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
+            int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
+            t1 = vaddw_u8(t1, in0);
+            // saturating narrow clamps the sum to [0, 255]
+            *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
+        }
+    }
+    // scalar tail
+    for (; x < width; x++)
+    {
+        signDown = signOf(rec[x] - rec[x + stride]);
+        edgeType = signDown + upBuff1[x] + 2;
+        upBuff1[x] = -signDown;
+        rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
+    }
+}
+
+/* Two-row variant of the vertical SAO edge offset: runs the same per-row
+ * step twice, advancing rec by stride in between. upBuff1 is updated after
+ * the first row and consumed by the second.
+ *   rec      - first pixel of the first row; rec[x + stride] is the pixel below
+ *   upBuff1  - sign against the row above for each x; rewritten per row with
+ *              the negated sign against the row below
+ *   offsetEo - offset table indexed by edge type
+ *              (signDown + upBuff1[x] + 2, so entries 0..4 are read)
+ *   stride   - distance between rows
+ *   width    - number of pixels processed per row */
+static void processSaoCUE1_2Rows_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int width)
+{
+    int y;
+    int8_t signDown;
+    int edgeType;
+
+    for (y = 0; y < 2; y++)
+    {
+        int x = 0;
+        if (width >= 8)
+        {
+            int8x8_t tbl = *(int8x8_t *)offsetEo;
+            for (; (x + 8) <= width; x += 8)
+            {
+                uint8x8_t in0 = *(uint8x8_t *)&rec[x];
+                uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
+                int8x8_t vsignDown = sign_diff_neon(in0, in1);
+                int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
+                *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
+                int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
+                t1 = vaddw_u8(t1, in0);
+                // clamp to [0, 255] before narrowing back to bytes
+                t1 = vmaxq_s16(t1, vdupq_n_s16(0));
+                t1 = vminq_s16(t1, vdupq_n_s16(255));
+                *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
+            }
+        }
+        // scalar tail
+        for (; x < width; x++)
+        {
+            signDown = signOf(rec[x] - rec[x + stride]);
+            edgeType = signDown + upBuff1[x] + 2;
+            upBuff1[x] = -signDown;
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
+        }
+        rec += stride;
+    }
+}
+
+/* SAO edge offset along the diagonal, applied in place to one row of rec.
+ * Each pixel is compared with rec[x + stride + 1].
+ *   rec      - first pixel of the row
+ *   bufft    - receives the negated down-right sign at bufft[x + 1]
+ *   buff1    - sign against the other diagonal neighbour for each x (read only)
+ *   offsetEo - offset table indexed by edge type
+ *              (signDown + buff1[x] + 2, so entries 0..4 are read)
+ *   width    - number of pixels processed
+ *   stride   - distance to the row below
+ * When buff1 and bufft are less than 16 bytes apart, only the scalar loop is
+ * used. NOTE(review): presumably because an 8-wide store to bufft could
+ * overwrite buff1 entries that have not been read yet - confirm. */
+static void processSaoCUE2_neon(pixel *rec, int8_t *bufft, int8_t *buff1, int8_t *offsetEo, int width, intptr_t stride)
+{
+    int x;
+
+    if (abs(buff1 - bufft) < 16)
+    {
+        for (x = 0; x < width; x++)
+        {
+            int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
+            int edgeType = signDown + buff1[x] + 2;
+            bufft[x + 1] = -signDown;
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
+        }
+    }
+    else
+    {
+        int8x8_t tbl = *(int8x8_t *)offsetEo;
+        x = 0;
+        for (; (x + 8) <= width; x += 8)
+        {
+            uint8x8_t in0 = *(uint8x8_t *)&rec[x];
+            uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride + 1];
+            int8x8_t vsignDown = sign_diff_neon(in0, in1);
+            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&buff1[x]), vdup_n_s8(2));
+            *(int8x8_t *)&bufft[x + 1] = vneg_s8(vsignDown);
+            int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
+            t1 = vaddw_u8(t1, in0);
+            // clamp to [0, 255] before narrowing back to bytes
+            t1 = vmaxq_s16(t1, vdupq_n_s16(0));
+            t1 = vminq_s16(t1, vdupq_n_s16(255));
+            *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
+        }
+        // scalar tail
+        for (; x < width; x++)
+        {
+            int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
+            int edgeType = signDown + buff1[x] + 2;
+            bufft[x + 1] = -signDown;
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
+        }
+    }
+}
+
+
+static void processSaoCUE3_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX)
​

x265_3.6.tar.gz/source/common/aarch64/loopfilter-prim.h Added

 
@@ -0,0 +1,16 @@
+// Declares setupLoopFilterPrimitives_neon() for the AArch64 NEON loop filter.
+#ifndef _LOOPFILTER_NEON_H__
+#define _LOOPFILTER_NEON_H__
+
+#include "common.h"
+#include "primitives.h"
+
+#define PIXEL_MIN 0
+
+namespace X265_NS
+{
+// NOTE(review): presumably installs the NEON loop-filter routines into p;
+// the definition is not part of this header - confirm in loopfilter-prim.cpp.
+void setupLoopFilterPrimitives_neon(EncoderPrimitives &p);
+
+};
+
+
+#endif
​

x265_3.6.tar.gz/source/common/aarch64/mc-a-common.S Added

@@ -0,0 +1,48 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// This file contains the macros written using NEON instruction set
+// that are also used by the SVE2 functions
+
+.arch           armv8-a
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.macro addAvg_start                         // shared prologue of the addAvg kernels: scale strides, build the v30 constant
+    lsl             x3, x3, #1              // x3 (src0Stride per the addAvg prototype) *= 2
+    lsl             x4, x4, #1              // x4 (src1Stride per the addAvg prototype) *= 2
+    mov             w11, #0x40
+    dup             v30.16b, w11            // every byte = 0x40, so each 16-bit lane reads as 0x4040; consumed by addavg_1
+.endm
+
+.macro addavg_1 v0, v1                                  // \v0 = (\v0 + \v1 + v30) >> 7 per 16-bit lane; clobbers v16 and v17
+    add             \v0\().8h, \v0\().8h, \v1\().8h     // 16-bit lane-wise sum of the two inputs
+    saddl           v16.4s, \v0\().4h, v30.4h           // low four lanes: sign-extend to 32 bit and add v30
+    saddl2          v17.4s, \v0\().8h, v30.8h           // high four lanes: same
+    shrn            \v0\().4h, v16.4s, #7               // shift right by 7 and narrow into the low half of \v0
+    shrn2           \v0\().8h, v17.4s, #7               // shift right by 7 and narrow into the high half of \v0
+.endm

 
@@ -0,0 +1,48 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// This file contains the macros written using NEON instruction set
+// that are also used by the SVE2 functions
+
+.arch           armv8-a
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.macro addAvg_start
+    lsl             x3, x3, #1
+    lsl             x4, x4, #1
+    mov             w11, #0x40
+    dup             v30.16b, w11
+.endm
+
+.macro addavg_1 v0, v1
+    add             \v0\().8h, \v0\().8h, \v1\().8h
+    saddl           v16.4s, \v0\().4h, v30.4h
+    saddl2          v17.4s, \v0\().8h, v30.8h
+    shrn            \v0\().4h, v16.4s, #7
+    shrn2           \v0\().8h, v17.4s, #7
+.endm
​

x265_3.6.tar.gz/source/common/aarch64/mc-a-sve2.S Added

@@ -0,0 +1,924 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+#include "mc-a-common.S"
+
+.arch armv8-a+sve2
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// pixel_avg_pp 12x16 (SVE2): rounding average of two 12x16 blocks of bytes.
+// Register use as seen below: x0/x1 = destination pointer/stride,
+// x2/x3 and x4/x5 = the two source pointers/strides.
+// Each row is processed as a 4-byte piece (z0/z2) plus an 8-byte piece at
+// byte offset 4 (z1/z3). The strides are reduced by 4 up front and every
+// pointer is bumped by 4 per row, so the net advance is the original stride.
+function PFX(pixel_avg_pp_12x16_sve2)
+    sub             x1, x1, #4
+    sub             x3, x3, #4
+    sub             x5, x5, #4
+    ptrue           p0.s, vl1               // one 32-bit lane  = bytes 0..3
+    ptrue           p1.b, vl8               // eight byte lanes = bytes 0..7
+    mov             x11, #4
+.rept 16
+    ld1w            {z0.s}, p0/z, [x2]
+    ld1b            {z1.b}, p1/z, [x2, x11]
+    ld1w            {z2.s}, p0/z, [x4]
+    ld1b            {z3.b}, p1/z, [x4, x11]
+    add             x2, x2, #4
+    add             x2, x2, x3
+    add             x4, x4, #4
+    add             x4, x4, x5
+    urhadd          z0.b, p1/m, z0.b, z2.b
+    urhadd          z1.b, p1/m, z1.b, z3.b
+    st1b            {z0.b}, p1, [x0]        // writes bytes 0..7; bytes 4..7 are rewritten by the next store
+    st1b            {z1.b}, p1, [x0, x11]   // writes bytes 4..11
+    add             x0, x0, #4
+    add             x0, x0, x1
+.endr
+    ret
+endfunc
+
+// pixel_avg_pp 24x32 (SVE2 build): rounding average of two 24x32 blocks of bytes.
+// x0/x1 = destination pointer/stride, x2/x3 and x4/x5 = source pointers/strides.
+// rdvl yields the vector length in bytes: at 16 bytes the NEON path below is
+// used (16 + 8 bytes per row); with longer vectors one predicated 24-byte
+// load/store handles each row.
+function PFX(pixel_avg_pp_24x32_sve2)
+    mov             w12, #4                 // 4 iterations x 8 unrolled rows = 32 rows
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_avg_pp_24x32
+    sub             x1, x1, #16             // strides compensate for the #16 post-increments
+    sub             x3, x3, #16
+    sub             x5, x5, #16
+.lpavg_24x32_sve2:
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b}, [x2], #16
+    ld1             {v1.8b}, [x2], x3
+    ld1             {v2.16b}, [x4], #16
+    ld1             {v3.8b}, [x4], x5
+    urhadd          v0.16b, v0.16b, v2.16b
+    urhadd          v1.8b, v1.8b, v3.8b
+    st1             {v0.16b}, [x0], #16
+    st1             {v1.8b}, [x0], x1
+.endr
+    cbnz            w12, .lpavg_24x32_sve2
+    ret
+.vl_gt_16_pixel_avg_pp_24x32:
+    mov             x10, #24
+    mov             x11, #0
+    whilelt         p0.b, x11, x10          // predicate covering byte lanes 0..23
+.vl_gt_16_loop_pixel_avg_pp_24x32:
+    sub             w12, w12, #1
+.rept 8
+    ld1b            {z0.b}, p0/z, [x2]
+    ld1b            {z2.b}, p0/z, [x4]
+    add             x2, x2, x3
+    add             x4, x4, x5
+    urhadd          z0.b, p0/m, z0.b, z2.b
+    st1b            {z0.b}, p0, [x0]
+    add             x0, x0, x1
+.endr
+    cbnz            w12, .vl_gt_16_loop_pixel_avg_pp_24x32
+    ret
+endfunc
+
+// Emits pixel_avg_pp_32x<h>_sve2: rounding average of two 32-byte-wide blocks,
+// fully unrolled over \h rows.
+// x0/x1 = destination pointer/stride, x2/x3 and x4/x5 = source pointers/strides.
+// With a 16-byte vector length (rdvl) the NEON path uses two q registers per
+// row; with longer vectors one 32-lane predicated access covers a row.
+.macro pixel_avg_pp_32xN_sve2 h
+function PFX(pixel_avg_pp_32x\h\()_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_avg_pp_32_\h
+.rept \h
+    ld1             {v0.16b-v1.16b}, [x2], x3
+    ld1             {v2.16b-v3.16b}, [x4], x5
+    urhadd          v0.16b, v0.16b, v2.16b
+    urhadd          v1.16b, v1.16b, v3.16b
+    st1             {v0.16b-v1.16b}, [x0], x1
+.endr
+    ret
+.vl_gt_16_pixel_avg_pp_32_\h:
+    ptrue           p0.b, vl32
+.rept \h
+    ld1b            {z0.b}, p0/z, [x2]
+    ld1b            {z2.b}, p0/z, [x4]
+    add             x2, x2, x3
+    add             x4, x4, x5
+    urhadd          z0.b, p0/m, z0.b, z2.b
+    st1b            {z0.b}, p0, [x0]
+    add             x0, x0, x1
+.endr
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_32xN_sve2 8
+pixel_avg_pp_32xN_sve2 16
+pixel_avg_pp_32xN_sve2 24
+
+// Emits pixel_avg_pp_32x<h>_sve2 for the taller blocks: same computation as
+// pixel_avg_pp_32xN_sve2, but looped (\h / 8 iterations of 8 unrolled rows)
+// instead of fully unrolled. w12 is the loop counter.
+// x0/x1 = destination pointer/stride, x2/x3 and x4/x5 = source pointers/strides.
+.macro pixel_avg_pp_32xN1_sve2 h
+function PFX(pixel_avg_pp_32x\h\()_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_avg_pp_32xN1_\h
+    mov             w12, #\h / 8
+.lpavg_sve2_32x\h\():
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b-v1.16b}, [x2], x3
+    ld1             {v2.16b-v3.16b}, [x4], x5
+    urhadd          v0.16b, v0.16b, v2.16b
+    urhadd          v1.16b, v1.16b, v3.16b
+    st1             {v0.16b-v1.16b}, [x0], x1
+.endr
+    cbnz            w12, .lpavg_sve2_32x\h
+    ret
+.vl_gt_16_pixel_avg_pp_32xN1_\h:
+    ptrue           p0.b, vl32
+    mov             w12, #\h / 8
+.eq_32_loop_pixel_avg_pp_32xN1_\h\():
+    sub             w12, w12, #1
+.rept 8
+    ld1b            {z0.b}, p0/z, [x2]
+    ld1b            {z2.b}, p0/z, [x4]
+    add             x2, x2, x3
+    add             x4, x4, x5
+    urhadd          z0.b, p0/m, z0.b, z2.b
+    st1b            {z0.b}, p0, [x0]
+    add             x0, x0, x1
+.endr
+    cbnz            w12, .eq_32_loop_pixel_avg_pp_32xN1_\h
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_32xN1_sve2 32
+pixel_avg_pp_32xN1_sve2 64
+
+function PFX(pixel_avg_pp_48x64_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_avg_pp_48x64
+    mov             w12, #8
+.lpavg_48x64_sve2:
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b-v2.16b}, x2, x3
+    ld1             {v3.16b-v5.16b}, x4, x5
+    urhadd          v0.16b, v0.16b, v3.16b
+    urhadd          v1.16b, v1.16b, v4.16b
+    urhadd          v2.16b, v2.16b, v5.16b
+    st1             {v0.16b-v2.16b}, x0, x1
+.endr
+    cbnz            w12, .lpavg_48x64_sve2
+    ret
+.vl_gt_16_pixel_avg_pp_48x64:
+    cmp             x9, #32
+    bgt             .vl_gt_32_pixel_avg_pp_48x64
+    ptrue           p0.b, vl32
+    ptrue           p1.b, vl16
+    mov             w12, #8

 
@@ -0,0 +1,924 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+#include "mc-a-common.S"
+
+.arch armv8-a+sve2
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+function PFX(pixel_avg_pp_12x16_sve2)
+    sub             x1, x1, #4
+    sub             x3, x3, #4
+    sub             x5, x5, #4
+    ptrue           p0.s, vl1
+    ptrue           p1.b, vl8
+    mov             x11, #4
+.rept 16
+    ld1w            {z0.s}, p0/z, x2
+    ld1b            {z1.b}, p1/z, x2, x11
+    ld1w            {z2.s}, p0/z, x4
+    ld1b            {z3.b}, p1/z, x4, x11
+    add             x2, x2, #4
+    add             x2, x2, x3
+    add             x4, x4, #4
+    add             x4, x4, x5
+    urhadd          z0.b, p1/m, z0.b, z2.b
+    urhadd          z1.b, p1/m, z1.b, z3.b
+    st1b            {z0.b}, p1, x0
+    st1b            {z1.b}, p1, x0, x11
+    add             x0, x0, #4
+    add             x0, x0, x1
+.endr
+    ret
+endfunc
+
+function PFX(pixel_avg_pp_24x32_sve2)
+    mov             w12, #4
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_avg_pp_24x32
+    sub             x1, x1, #16
+    sub             x3, x3, #16
+    sub             x5, x5, #16
+.lpavg_24x32_sve2:
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b}, x2, #16
+    ld1             {v1.8b}, x2, x3
+    ld1             {v2.16b}, x4, #16
+    ld1             {v3.8b}, x4, x5
+    urhadd          v0.16b, v0.16b, v2.16b
+    urhadd          v1.8b, v1.8b, v3.8b
+    st1             {v0.16b}, x0, #16
+    st1             {v1.8b}, x0, x1
+.endr
+    cbnz            w12, .lpavg_24x32_sve2
+    ret
+.vl_gt_16_pixel_avg_pp_24x32:
+    mov             x10, #24
+    mov             x11, #0
+    whilelt         p0.b, x11, x10
+.vl_gt_16_loop_pixel_avg_pp_24x32:
+    sub             w12, w12, #1
+.rept 8
+    ld1b            {z0.b}, p0/z, x2
+    ld1b            {z2.b}, p0/z, x4
+    add             x2, x2, x3
+    add             x4, x4, x5
+    urhadd          z0.b, p0/m, z0.b, z2.b
+    st1b            {z0.b}, p0, x0
+    add             x0, x0, x1
+.endr
+    cbnz            w12, .vl_gt_16_loop_pixel_avg_pp_24x32
+    ret
+endfunc
+
+.macro pixel_avg_pp_32xN_sve2 h
+function PFX(pixel_avg_pp_32x\h\()_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_avg_pp_32_\h
+.rept \h
+    ld1             {v0.16b-v1.16b}, x2, x3
+    ld1             {v2.16b-v3.16b}, x4, x5
+    urhadd          v0.16b, v0.16b, v2.16b
+    urhadd          v1.16b, v1.16b, v3.16b
+    st1             {v0.16b-v1.16b}, x0, x1
+.endr
+    ret
+.vl_gt_16_pixel_avg_pp_32_\h:
+    ptrue           p0.b, vl32
+.rept \h
+    ld1b            {z0.b}, p0/z, x2
+    ld1b            {z2.b}, p0/z, x4
+    add             x2, x2, x3
+    add             x4, x4, x5
+    urhadd          z0.b, p0/m, z0.b, z2.b
+    st1b            {z0.b}, p0, x0
+    add             x0, x0, x1
+.endr
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_32xN_sve2 8
+pixel_avg_pp_32xN_sve2 16
+pixel_avg_pp_32xN_sve2 24
+
+.macro pixel_avg_pp_32xN1_sve2 h
+function PFX(pixel_avg_pp_32x\h\()_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_avg_pp_32xN1_\h
+    mov             w12, #\h / 8
+.lpavg_sve2_32x\h\():
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b-v1.16b}, x2, x3
+    ld1             {v2.16b-v3.16b}, x4, x5
+    urhadd          v0.16b, v0.16b, v2.16b
+    urhadd          v1.16b, v1.16b, v3.16b
+    st1             {v0.16b-v1.16b}, x0, x1
+.endr
+    cbnz            w12, .lpavg_sve2_32x\h
+    ret
+.vl_gt_16_pixel_avg_pp_32xN1_\h:
+    ptrue           p0.b, vl32
+    mov             w12, #\h / 8
+.eq_32_loop_pixel_avg_pp_32xN1_\h\():
+    sub             w12, w12, #1
+.rept 8
+    ld1b            {z0.b}, p0/z, x2
+    ld1b            {z2.b}, p0/z, x4
+    add             x2, x2, x3
+    add             x4, x4, x5
+    urhadd          z0.b, p0/m, z0.b, z2.b
+    st1b            {z0.b}, p0, x0
+    add             x0, x0, x1
+.endr
+    cbnz            w12, .eq_32_loop_pixel_avg_pp_32xN1_\h
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_32xN1_sve2 32
+pixel_avg_pp_32xN1_sve2 64
+
+function PFX(pixel_avg_pp_48x64_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_avg_pp_48x64
+    mov             w12, #8
+.lpavg_48x64_sve2:
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b-v2.16b}, x2, x3
+    ld1             {v3.16b-v5.16b}, x4, x5
+    urhadd          v0.16b, v0.16b, v3.16b
+    urhadd          v1.16b, v1.16b, v4.16b
+    urhadd          v2.16b, v2.16b, v5.16b
+    st1             {v0.16b-v2.16b}, x0, x1
+.endr
+    cbnz            w12, .lpavg_48x64_sve2
+    ret
+.vl_gt_16_pixel_avg_pp_48x64:
+    cmp             x9, #32
+    bgt             .vl_gt_32_pixel_avg_pp_48x64
+    ptrue           p0.b, vl32
+    ptrue           p1.b, vl16
+    mov             w12, #8
​

x265_3.5.tar.gz/source/common/aarch64/mc-a.S -> x265_3.6.tar.gz/source/common/aarch64/mc-a.S Changed

@@ -1,7 +1,8 @@
 /*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
  *
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *          Sebastian Pop <spop@amazon.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -22,15 +23,20 @@
  *****************************************************************************/
 
 #include "asm.S"
+#include "mc-a-common.S"
 
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
 .section .rodata
+#endif
 
 .align 4
 
 .text
 
 .macro pixel_avg_pp_4xN_neon h
-function x265_pixel_avg_pp_4x\h\()_neon
+function PFX(pixel_avg_pp_4x\h\()_neon)
 .rept \h
     ld1             {v0.s}0, x2, x3
     ld1             {v1.s}0, x4, x5
@@ -46,7 +52,7 @@
 pixel_avg_pp_4xN_neon 16
 
 .macro pixel_avg_pp_8xN_neon h
-function x265_pixel_avg_pp_8x\h\()_neon
+function PFX(pixel_avg_pp_8x\h\()_neon)
 .rept \h
     ld1             {v0.8b}, x2, x3
     ld1             {v1.8b}, x4, x5
@@ -61,3 +67,491 @@
 pixel_avg_pp_8xN_neon 8
 pixel_avg_pp_8xN_neon 16
 pixel_avg_pp_8xN_neon 32
+
+// pixel_avg_pp 12x16 (NEON): rounding average of two 12x16 blocks of bytes.
+// x0/x1 = destination pointer/stride, x2/x3 and x4/x5 = source pointers/strides.
+// Each row is a 4-byte lane access followed by an 8-byte access; the strides
+// are reduced by 4 to account for the #4 post-increment of the first access.
+function PFX(pixel_avg_pp_12x16_neon)
+    sub             x1, x1, #4
+    sub             x3, x3, #4
+    sub             x5, x5, #4
+.rept 16
+    ld1             {v0.s}[0], [x2], #4
+    ld1             {v1.8b}, [x2], x3
+    ld1             {v2.s}[0], [x4], #4
+    ld1             {v3.8b}, [x4], x5
+    urhadd          v4.8b, v0.8b, v2.8b
+    urhadd          v5.8b, v1.8b, v3.8b
+    st1             {v4.s}[0], [x0], #4
+    st1             {v5.8b}, [x0], x1
+.endr
+    ret
+endfunc
+
+// Emits pixel_avg_pp_16x<h>_neon: rounding average of two 16-byte-wide blocks,
+// fully unrolled over \h rows (one q register per row).
+// x0/x1 = destination pointer/stride, x2/x3 and x4/x5 = source pointers/strides.
+.macro pixel_avg_pp_16xN_neon h
+function PFX(pixel_avg_pp_16x\h\()_neon)
+.rept \h
+    ld1             {v0.16b}, [x2], x3
+    ld1             {v1.16b}, [x4], x5
+    urhadd          v2.16b, v0.16b, v1.16b
+    st1             {v2.16b}, [x0], x1
+.endr
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_16xN_neon 4
+pixel_avg_pp_16xN_neon 8
+pixel_avg_pp_16xN_neon 12
+pixel_avg_pp_16xN_neon 16
+pixel_avg_pp_16xN_neon 32
+
+// pixel_avg_pp 16x64 (NEON): as the 16xN variants, but looped -
+// 8 iterations (w12) of 8 unrolled rows.
+// x0/x1 = destination pointer/stride, x2/x3 and x4/x5 = source pointers/strides.
+function PFX(pixel_avg_pp_16x64_neon)
+    mov             w12, #8
+.lpavg_16x64:
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b}, [x2], x3
+    ld1             {v1.16b}, [x4], x5
+    urhadd          v2.16b, v0.16b, v1.16b
+    st1             {v2.16b}, [x0], x1
+.endr
+    cbnz            w12, .lpavg_16x64
+    ret
+endfunc
+
+// pixel_avg_pp 24x32 (NEON): each row is a 16-byte plus an 8-byte access.
+// Strides are reduced by 16 to account for the #16 post-increment.
+// 4 iterations (w12) of 8 unrolled rows.
+// x0/x1 = destination pointer/stride, x2/x3 and x4/x5 = source pointers/strides.
+function PFX(pixel_avg_pp_24x32_neon)
+    sub             x1, x1, #16
+    sub             x3, x3, #16
+    sub             x5, x5, #16
+    mov             w12, #4
+.lpavg_24x32:
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b}, [x2], #16
+    ld1             {v1.8b}, [x2], x3
+    ld1             {v2.16b}, [x4], #16
+    ld1             {v3.8b}, [x4], x5
+    urhadd          v0.16b, v0.16b, v2.16b
+    urhadd          v1.8b, v1.8b, v3.8b
+    st1             {v0.16b}, [x0], #16
+    st1             {v1.8b}, [x0], x1
+.endr
+    cbnz            w12, .lpavg_24x32
+    ret
+endfunc
+
+// Emits pixel_avg_pp_32x<h>_neon: rounding average of two 32-byte-wide blocks,
+// fully unrolled over \h rows (two q registers per row).
+// x0/x1 = destination pointer/stride, x2/x3 and x4/x5 = source pointers/strides.
+.macro pixel_avg_pp_32xN_neon h
+function PFX(pixel_avg_pp_32x\h\()_neon)
+.rept \h
+    ld1             {v0.16b-v1.16b}, [x2], x3
+    ld1             {v2.16b-v3.16b}, [x4], x5
+    urhadd          v0.16b, v0.16b, v2.16b
+    urhadd          v1.16b, v1.16b, v3.16b
+    st1             {v0.16b-v1.16b}, [x0], x1
+.endr
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_32xN_neon 8
+pixel_avg_pp_32xN_neon 16
+pixel_avg_pp_32xN_neon 24
+
+// Emits pixel_avg_pp_32x<h>_neon for the taller blocks: looped form,
+// \h / 8 iterations (w12) of 8 unrolled rows.
+// x0/x1 = destination pointer/stride, x2/x3 and x4/x5 = source pointers/strides.
+.macro pixel_avg_pp_32xN1_neon h
+function PFX(pixel_avg_pp_32x\h\()_neon)
+    mov             w12, #\h / 8
+.lpavg_32x\h\():
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b-v1.16b}, [x2], x3
+    ld1             {v2.16b-v3.16b}, [x4], x5
+    urhadd          v0.16b, v0.16b, v2.16b
+    urhadd          v1.16b, v1.16b, v3.16b
+    st1             {v0.16b-v1.16b}, [x0], x1
+.endr
+    cbnz            w12, .lpavg_32x\h
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_32xN1_neon 32
+pixel_avg_pp_32xN1_neon 64
+
+// pixel_avg_pp 48x64 (NEON): three q registers per row,
+// 8 iterations (w12) of 8 unrolled rows.
+// x0/x1 = destination pointer/stride, x2/x3 and x4/x5 = source pointers/strides.
+function PFX(pixel_avg_pp_48x64_neon)
+    mov             w12, #8
+.lpavg_48x64:
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b-v2.16b}, [x2], x3
+    ld1             {v3.16b-v5.16b}, [x4], x5
+    urhadd          v0.16b, v0.16b, v3.16b
+    urhadd          v1.16b, v1.16b, v4.16b
+    urhadd          v2.16b, v2.16b, v5.16b
+    st1             {v0.16b-v2.16b}, [x0], x1
+.endr
+    cbnz            w12, .lpavg_48x64
+    ret
+endfunc
+
+// Emits pixel_avg_pp_64x<h>_neon: four q registers per row,
+// \h / 4 iterations (w12) of 4 unrolled rows.
+// x0/x1 = destination pointer/stride, x2/x3 and x4/x5 = source pointers/strides.
+.macro pixel_avg_pp_64xN_neon h
+function PFX(pixel_avg_pp_64x\h\()_neon)
+    mov             w12, #\h / 4
+.lpavg_64x\h\():
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v0.16b-v3.16b}, [x2], x3
+    ld1             {v4.16b-v7.16b}, [x4], x5
+    urhadd          v0.16b, v0.16b, v4.16b
+    urhadd          v1.16b, v1.16b, v5.16b
+    urhadd          v2.16b, v2.16b, v6.16b
+    urhadd          v3.16b, v3.16b, v7.16b
+    st1             {v0.16b-v3.16b}, [x0], x1
+.endr
+    cbnz            w12, .lpavg_64x\h
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_64xN_neon 16
+pixel_avg_pp_64xN_neon 32
+pixel_avg_pp_64xN_neon 48
+pixel_avg_pp_64xN_neon 64
+
+// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
+.macro addAvg_2xN h
+function PFX(addAvg_2x\h\()_neon)
+    addAvg_start
+.rept \h / 2
+    ldr             w10, x0
+    ldr             w11, x1

 
@@ -1,7 +1,8 @@
 /*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
  *
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *          Sebastian Pop <spop@amazon.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -22,15 +23,20 @@
  *****************************************************************************/
 
 #include "asm.S"
+#include "mc-a-common.S"
 
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
 .section .rodata
+#endif
 
 .align 4
 
 .text
 
 .macro pixel_avg_pp_4xN_neon h
-function x265_pixel_avg_pp_4x\h\()_neon
+function PFX(pixel_avg_pp_4x\h\()_neon)
 .rept \h
     ld1             {v0.s}0, x2, x3
     ld1             {v1.s}0, x4, x5
@@ -46,7 +52,7 @@
 pixel_avg_pp_4xN_neon 16
 
 .macro pixel_avg_pp_8xN_neon h
-function x265_pixel_avg_pp_8x\h\()_neon
+function PFX(pixel_avg_pp_8x\h\()_neon)
 .rept \h
     ld1             {v0.8b}, x2, x3
     ld1             {v1.8b}, x4, x5
@@ -61,3 +67,491 @@
 pixel_avg_pp_8xN_neon 8
 pixel_avg_pp_8xN_neon 16
 pixel_avg_pp_8xN_neon 32
+
+function PFX(pixel_avg_pp_12x16_neon)
+    sub             x1, x1, #4
+    sub             x3, x3, #4
+    sub             x5, x5, #4
+.rept 16
+    ld1             {v0.s}0, x2, #4
+    ld1             {v1.8b}, x2, x3
+    ld1             {v2.s}0, x4, #4
+    ld1             {v3.8b}, x4, x5
+    urhadd          v4.8b, v0.8b, v2.8b
+    urhadd          v5.8b, v1.8b, v3.8b
+    st1             {v4.s}0, x0, #4
+    st1             {v5.8b}, x0, x1
+.endr
+    ret
+endfunc
+
+.macro pixel_avg_pp_16xN_neon h
+function PFX(pixel_avg_pp_16x\h\()_neon)
+.rept \h
+    ld1             {v0.16b}, x2, x3
+    ld1             {v1.16b}, x4, x5
+    urhadd          v2.16b, v0.16b, v1.16b
+    st1             {v2.16b}, x0, x1
+.endr
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_16xN_neon 4
+pixel_avg_pp_16xN_neon 8
+pixel_avg_pp_16xN_neon 12
+pixel_avg_pp_16xN_neon 16
+pixel_avg_pp_16xN_neon 32
+
+function PFX(pixel_avg_pp_16x64_neon)
+    mov             w12, #8
+.lpavg_16x64:
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b}, x2, x3
+    ld1             {v1.16b}, x4, x5
+    urhadd          v2.16b, v0.16b, v1.16b
+    st1             {v2.16b}, x0, x1
+.endr
+    cbnz            w12, .lpavg_16x64
+    ret
+endfunc
+
+function PFX(pixel_avg_pp_24x32_neon)
+    sub             x1, x1, #16
+    sub             x3, x3, #16
+    sub             x5, x5, #16
+    mov             w12, #4
+.lpavg_24x32:
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b}, x2, #16
+    ld1             {v1.8b}, x2, x3
+    ld1             {v2.16b}, x4, #16
+    ld1             {v3.8b}, x4, x5
+    urhadd          v0.16b, v0.16b, v2.16b
+    urhadd          v1.8b, v1.8b, v3.8b
+    st1             {v0.16b}, x0, #16
+    st1             {v1.8b}, x0, x1
+.endr
+    cbnz            w12, .lpavg_24x32
+    ret
+endfunc
+
+.macro pixel_avg_pp_32xN_neon h
+function PFX(pixel_avg_pp_32x\h\()_neon)
+.rept \h
+    ld1             {v0.16b-v1.16b}, x2, x3
+    ld1             {v2.16b-v3.16b}, x4, x5
+    urhadd          v0.16b, v0.16b, v2.16b
+    urhadd          v1.16b, v1.16b, v3.16b
+    st1             {v0.16b-v1.16b}, x0, x1
+.endr
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_32xN_neon 8
+pixel_avg_pp_32xN_neon 16
+pixel_avg_pp_32xN_neon 24
+
+.macro pixel_avg_pp_32xN1_neon h
+function PFX(pixel_avg_pp_32x\h\()_neon)
+    mov             w12, #\h / 8
+.lpavg_32x\h\():
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b-v1.16b}, x2, x3
+    ld1             {v2.16b-v3.16b}, x4, x5
+    urhadd          v0.16b, v0.16b, v2.16b
+    urhadd          v1.16b, v1.16b, v3.16b
+    st1             {v0.16b-v1.16b}, x0, x1
+.endr
+    cbnz            w12, .lpavg_32x\h
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_32xN1_neon 32
+pixel_avg_pp_32xN1_neon 64
+
+function PFX(pixel_avg_pp_48x64_neon)
+    mov             w12, #8
+.lpavg_48x64:
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b-v2.16b}, x2, x3
+    ld1             {v3.16b-v5.16b}, x4, x5
+    urhadd          v0.16b, v0.16b, v3.16b
+    urhadd          v1.16b, v1.16b, v4.16b
+    urhadd          v2.16b, v2.16b, v5.16b
+    st1             {v0.16b-v2.16b}, x0, x1
+.endr
+    cbnz            w12, .lpavg_48x64
+    ret
+endfunc
+
+.macro pixel_avg_pp_64xN_neon h
+function PFX(pixel_avg_pp_64x\h\()_neon)
+    mov             w12, #\h / 4
+.lpavg_64x\h\():
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v0.16b-v3.16b}, x2, x3
+    ld1             {v4.16b-v7.16b}, x4, x5
+    urhadd          v0.16b, v0.16b, v4.16b
+    urhadd          v1.16b, v1.16b, v5.16b
+    urhadd          v2.16b, v2.16b, v6.16b
+    urhadd          v3.16b, v3.16b, v7.16b
+    st1             {v0.16b-v3.16b}, x0, x1
+.endr
+    cbnz            w12, .lpavg_64x\h
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_64xN_neon 16
+pixel_avg_pp_64xN_neon 32
+pixel_avg_pp_64xN_neon 48
+pixel_avg_pp_64xN_neon 64
+
+// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
+.macro addAvg_2xN h
+function PFX(addAvg_2x\h\()_neon)
+    addAvg_start
+.rept \h / 2
+    ldr             w10, x0
+    ldr             w11, x1
​

x265_3.6.tar.gz/source/common/aarch64/p2s-common.S Added

@@ -0,0 +1,102 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// This file contains the macros written using NEON instruction set
+// that are also used by the SVE2 functions
+
+.arch           armv8-a
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+#if HIGH_BIT_DEPTH                          // P2S_SHIFT = left shift applied to each sample by the p2s_* macros
+# if BIT_DEPTH == 10
+#  define P2S_SHIFT 4
+# elif BIT_DEPTH == 12
+#  define P2S_SHIFT 2
+# endif                                     // NOTE(review): any other BIT_DEPTH leaves P2S_SHIFT undefined here
+.macro p2s_start                            // prologue for the p2s_* macros: scale strides, build the v31 constant
+    add             x3, x3, x3              // x3 *= 2
+    add             x1, x1, x1              // x1 *= 2 (done in the high-bit-depth build only)
+    movi            v31.8h, #0xe0, lsl #8   // each 16-bit lane = 0xe000 (-8192 as signed); added to every shifted sample
+.endm
+
+#else // if !HIGH_BIT_DEPTH
+# define P2S_SHIFT 6
+.macro p2s_start                            // 8-bit build: only x3 is doubled, x1 is left as passed in
+    add             x3, x3, x3              // x3 *= 2
+    movi            v31.8h, #0xe0, lsl #8   // each 16-bit lane = 0xe000 (-8192 as signed)
+.endm
+#endif // HIGH_BIT_DEPTH
+
+// Converts a 2x2 block: out = (in << P2S_SHIFT) + v31 per sample, written as
+// 16-bit values. x0/x1 = source pointer/stride, x2/x3 = destination
+// pointer/stride; relies on v31 and the stride scaling set up by p2s_start.
+// The 8-bit path packs both 2-pixel rows into w10 before widening.
+.macro p2s_2x2
+#if HIGH_BIT_DEPTH
+    ld1             {v0.s}[0], [x0], x1
+    ld1             {v0.s}[1], [x0], x1
+    shl             v3.8h, v0.8h, #P2S_SHIFT
+#else
+    ldrh            w10, [x0]
+    add             x0, x0, x1
+    ldrh            w11, [x0]
+    orr             w10, w10, w11, lsl #16
+    add             x0, x0, x1
+    dup             v0.4s, w10
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
+#endif
+    add             v3.8h, v3.8h, v31.8h
+    st1             {v3.s}[0], [x2], x3
+    st1             {v3.s}[1], [x2], x3
+.endm
+
+// Converts a 6x2 block: out = (in << P2S_SHIFT) + v31 per sample, written as
+// 16-bit values. Each row is split into 4 samples (v0 -> v3) and 2 samples
+// (v1 -> v4). x0/x1 = source pointer/stride, x2/x3 = destination
+// pointer/stride; relies on v31 set up by p2s_start.
+// NOTE(review): the stores (and the high-bit-depth loads) post-increment by
+// #8 before adding the stride register, so callers presumably pre-adjust
+// x3 (and x1 in the high-bit-depth build) - not visible here.
+.macro p2s_6x2
+#if HIGH_BIT_DEPTH
+    ld1             {v0.d}[0], [x0], #8
+    ld1             {v1.s}[0], [x0], x1
+    ld1             {v0.d}[1], [x0], #8
+    ld1             {v1.s}[1], [x0], x1
+    shl             v3.8h, v0.8h, #P2S_SHIFT
+    shl             v4.8h, v1.8h, #P2S_SHIFT
+#else
+    ldr             s0, [x0]
+    ldrh            w10, [x0, #4]
+    add             x0, x0, x1
+    ld1             {v0.s}[1], [x0]
+    ldrh            w11, [x0, #4]
+    add             x0, x0, x1
+    orr             w10, w10, w11, lsl #16
+    dup             v1.4s, w10
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
+    ushll           v4.8h, v1.8b, #P2S_SHIFT
+#endif
+    add             v3.8h, v3.8h, v31.8h
+    add             v4.8h, v4.8h, v31.8h
+    st1             {v3.d}[0], [x2], #8
+    st1             {v4.s}[0], [x2], x3
+    st1             {v3.d}[1], [x2], #8
+    st1             {v4.s}[1], [x2], x3
+.endm

 
@@ -0,0 +1,102 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// This file contains the macros written using NEON instruction set
+// that are also used by the SVE2 functions
+
+.arch           armv8-a
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+#if HIGH_BIT_DEPTH
+# if BIT_DEPTH == 10
+#  define P2S_SHIFT 4
+# elif BIT_DEPTH == 12
+#  define P2S_SHIFT 2
+# endif
+.macro p2s_start
+    add             x3, x3, x3
+    add             x1, x1, x1
+    movi            v31.8h, #0xe0, lsl #8
+.endm
+
+#else // if !HIGH_BIT_DEPTH
+# define P2S_SHIFT 6
+.macro p2s_start
+    add             x3, x3, x3
+    movi            v31.8h, #0xe0, lsl #8
+.endm
+#endif // HIGH_BIT_DEPTH
+
+.macro p2s_2x2
+#if HIGH_BIT_DEPTH
+    ld1             {v0.s}0, x0, x1
+    ld1             {v0.s}1, x0, x1
+    shl             v3.8h, v0.8h, #P2S_SHIFT
+#else
+    ldrh            w10, x0
+    add             x0, x0, x1
+    ldrh            w11, x0
+    orr             w10, w10, w11, lsl #16
+    add             x0, x0, x1
+    dup             v0.4s, w10
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
+#endif
+    add             v3.8h, v3.8h, v31.8h
+    st1             {v3.s}0, x2, x3
+    st1             {v3.s}1, x2, x3
+.endm
+
+.macro p2s_6x2
+#if HIGH_BIT_DEPTH
+    ld1             {v0.d}0, x0, #8
+    ld1             {v1.s}0, x0, x1
+    ld1             {v0.d}1, x0, #8
+    ld1             {v1.s}1, x0, x1
+    shl             v3.8h, v0.8h, #P2S_SHIFT
+    shl             v4.8h, v1.8h, #P2S_SHIFT
+#else
+    ldr             s0, x0
+    ldrh            w10, x0, #4
+    add             x0, x0, x1
+    ld1             {v0.s}1, x0
+    ldrh            w11, x0, #4
+    add             x0, x0, x1
+    orr             w10, w10, w11, lsl #16
+    dup             v1.4s, w10
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
+    ushll           v4.8h, v1.8b, #P2S_SHIFT
+#endif
+    add             v3.8h, v3.8h, v31.8h
+    add             v4.8h, v4.8h, v31.8h
+    st1             {v3.d}0, x2, #8
+    st1             {v4.s}0, x2, x3
+    st1             {v3.d}1, x2, #8
+    st1             {v4.s}1, x2, x3
+.endm
​

x265_3.6.tar.gz/source/common/aarch64/p2s-sve.S Added

@@ -0,0 +1,445 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+#include "p2s-common.S"
+
+.arch armv8-a+sve
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+#if HIGH_BIT_DEPTH
+# if BIT_DEPTH == 10
+#  define P2S_SHIFT 4
+# elif BIT_DEPTH == 12
+#  define P2S_SHIFT 2
+# endif
+
+.macro p2s_start_sve                        // prologue used by the SVE entry points of this file
+    add             x3, x3, x3              // x3 *= 2 (presumably dst stride: int16 elements -> bytes; confirm)
+    add             x1, x1, x1              // x1 *= 2 (presumably src stride: 16-bit pixels -> bytes; confirm)
+    mov             z31.h, #0xe0, lsl #8    // every 16-bit lane of z31 = 0xe000, the bias added after the shift
+.endm
+
+#else // if !HIGH_BIT_DEPTH
+# define P2S_SHIFT 6
+.macro p2s_start_sve                        // non-HIGH_BIT_DEPTH variant: x1 is left unscaled
+    add             x3, x3, x3              // x3 *= 2
+    mov             z31.h, #0xe0, lsl #8    // every 16-bit lane of z31 = 0xe000
+.endm
+
+#endif // HIGH_BIT_DEPTH
+
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
+.macro p2s_2xN_sve h                        // emits filterPixelToShort_2x<h>_sve
+function PFX(filterPixelToShort_2x\h\()_sve)
+    p2s_start_sve                           // scales strides, loads the bias into z31
+.rept \h / 2                                // p2s_2x2 consumes two rows per expansion
+    p2s_2x2
+.endr
+    ret
+endfunc
+.endm
+
+p2s_2xN_sve 4                               // instantiated for heights 4, 8 and 16
+p2s_2xN_sve 8
+p2s_2xN_sve 16
+
+.macro p2s_6xN_sve h                        // emits filterPixelToShort_6x<h>_sve
+function PFX(filterPixelToShort_6x\h\()_sve)
+    p2s_start_sve
+    sub             x3, x3, #8              // p2s_6x2 post-increments x2 by 8 on the first store of each row
+#if HIGH_BIT_DEPTH
+    sub             x1, x1, #8              // same compensation for the first load of each row
+#endif
+.rept \h / 2                                // p2s_6x2 consumes two rows per expansion
+    p2s_6x2
+.endr
+    ret
+endfunc
+.endm
+
+p2s_6xN_sve 8                               // instantiated for heights 8 and 16
+p2s_6xN_sve 16
+
+// 4x2 block: dst = (src << P2S_SHIFT) + z31 per 16-bit lane.
+// x0 = src, x1 = src stride, x2 = dst, x3 = dst stride (scaled by p2s_start_sve).
+function PFX(filterPixelToShort_4x2_sve)
+    p2s_start_sve
+#if HIGH_BIT_DEPTH
+    ptrue           p0.h, vl8
+    // z1/z2 = {0, stride, 2*stride, ...}: per-row byte offsets for gather/scatter
+    index           z1.d, #0, x1
+    index           z2.d, #0, x3
+    // each 64-bit element is one row of four 16-bit pixels
+    // NOTE(review): p0 is built for .h elements but consumed by .d accesses - confirm
+    // this selects exactly the two rows intended.
+    ld1d            {z3.d}, p0/z, [x0, z1.d]
+    lsl             z3.h, p0/m, z3.h, #P2S_SHIFT
+    add             z3.h, p0/m, z3.h, z31.h
+    st1d            {z3.d}, p0, [x2, z2.d]
+#else
+    ptrue           p0.h, vl4
+    // ld1b into .h lanes zero-extends four bytes to four 16-bit lanes
+    ld1b            {z0.h}, p0/z, [x0]
+    add             x0, x0, x1
+    ld1b            {z1.h}, p0/z, [x0]
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
+    add             z0.h, p0/m, z0.h, z31.h
+    add             z1.h, p0/m, z1.h, z31.h
+    st1h            {z0.h}, p0, [x2]
+    add             x2, x2, x3
+    st1h            {z1.h}, p0, [x2]
+#endif
+    ret
+endfunc
+
+
+// Emits filterPixelToShort_8x<h>_sve: one 8-pixel row per .rept iteration,
+// dst = (src << P2S_SHIFT) + z31 per 16-bit lane.
+.macro p2s_8xN_sve h
+function PFX(filterPixelToShort_8x\h\()_sve)
+    p2s_start_sve
+    ptrue           p0.h, vl8
+.rept \h
+#if HIGH_BIT_DEPTH
+    // NOTE(review): the load uses .d elements under a predicate built for .h - confirm
+    ld1d            {z0.d}, p0/z, [x0]
+    add             x0, x0, x1
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
+    add             z0.h, p0/m, z0.h, z31.h
+    st1h            {z0.h}, p0, [x2]
+    add             x2, x2, x3
+#else
+    // ld1b into .h lanes zero-extends eight bytes to eight 16-bit lanes
+    ld1b            {z0.h}, p0/z, [x0]
+    add             x0, x0, x1
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
+    add             z0.h, p0/m, z0.h, z31.h
+    st1h            {z0.h}, p0, [x2]
+    add             x2, x2, x3
+#endif
+.endr
+    ret
+endfunc
+.endm
+
+p2s_8xN_sve 2
+
+.macro p2s_32xN_sve h
+function PFX(filterPixelToShort_32x\h\()_sve)
+#if HIGH_BIT_DEPTH
+    p2s_start_sve
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_filterPixelToShort_high_32x\h
+    ptrue           p0.h, vl8
+.rept \h
+    ld1h            {z0.h}, p0/z, x0
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
+    ld1h            {z2.h}, p0/z, x0, #2, mul vl
+    ld1h            {z3.h}, p0/z, x0, #3, mul vl
+    add             x0, x0, x1
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
+    lsl             z2.h, p0/m, z2.h, #P2S_SHIFT
+    lsl             z3.h, p0/m, z3.h, #P2S_SHIFT
+    add             z0.h, p0/m, z0.h, z31.h
+    add             z1.h, p0/m, z1.h, z31.h
+    add             z2.h, p0/m, z2.h, z31.h
+    add             z3.h, p0/m, z3.h, z31.h
+    st1h            {z0.h}, p0, x2
+    st1h            {z1.h}, p0, x2, #1, mul vl
+    st1h            {z2.h}, p0, x2, #2, mul vl
+    st1h            {z3.h}, p0, x2, #3, mul vl
+    add             x2, x2, x3
+.endr
+    ret
+.vl_gt_16_filterPixelToShort_high_32x\h\():
+    cmp             x9, #48
+    bgt             .vl_gt_48_filterPixelToShort_high_32x\h
+    ptrue           p0.h, vl16
+.rept \h
+    ld1h            {z0.h}, p0/z, x0
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
+    add             x0, x0, x1
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
+    add             z0.h, p0/m, z0.h, z31.h
+    add             z1.h, p0/m, z1.h, z31.h
+    st1h            {z0.h}, p0, x2
+    st1h            {z1.h}, p0, x2, #1, mul vl
+    add             x2, x2, x3
+.endr
+    ret
+.vl_gt_48_filterPixelToShort_high_32x\h\():
+    ptrue           p0.h, vl32
+.rept \h
+    ld1h            {z0.h}, p0/z, x0
+    add             x0, x0, x1
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
+    add             z0.h, p0/m, z0.h, z31.h

 
@@ -0,0 +1,445 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+#include "p2s-common.S"
+
+.arch armv8-a+sve
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+#if HIGH_BIT_DEPTH
+# if BIT_DEPTH == 10
+#  define P2S_SHIFT 4
+# elif BIT_DEPTH == 12
+#  define P2S_SHIFT 2
+# endif
+
+.macro p2s_start_sve                        // prologue used by the SVE entry points of this file
+    add             x3, x3, x3              // x3 *= 2 (presumably dst stride: int16 elements -> bytes; confirm)
+    add             x1, x1, x1              // x1 *= 2 (presumably src stride: 16-bit pixels -> bytes; confirm)
+    mov             z31.h, #0xe0, lsl #8    // every 16-bit lane of z31 = 0xe000, the bias added after the shift
+.endm
+
+#else // if !HIGH_BIT_DEPTH
+# define P2S_SHIFT 6
+.macro p2s_start_sve                        // non-HIGH_BIT_DEPTH variant: x1 is left unscaled
+    add             x3, x3, x3              // x3 *= 2
+    mov             z31.h, #0xe0, lsl #8    // every 16-bit lane of z31 = 0xe000
+.endm
+
+#endif // HIGH_BIT_DEPTH
+
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
+.macro p2s_2xN_sve h                        // emits filterPixelToShort_2x<h>_sve
+function PFX(filterPixelToShort_2x\h\()_sve)
+    p2s_start_sve                           // scales strides, loads the bias into z31
+.rept \h / 2                                // p2s_2x2 consumes two rows per expansion
+    p2s_2x2
+.endr
+    ret
+endfunc
+.endm
+
+p2s_2xN_sve 4                               // instantiated for heights 4, 8 and 16
+p2s_2xN_sve 8
+p2s_2xN_sve 16
+
+.macro p2s_6xN_sve h                        // emits filterPixelToShort_6x<h>_sve
+function PFX(filterPixelToShort_6x\h\()_sve)
+    p2s_start_sve
+    sub             x3, x3, #8              // p2s_6x2 post-increments x2 by 8 on the first store of each row
+#if HIGH_BIT_DEPTH
+    sub             x1, x1, #8              // same compensation for the first load of each row
+#endif
+.rept \h / 2                                // p2s_6x2 consumes two rows per expansion
+    p2s_6x2
+.endr
+    ret
+endfunc
+.endm
+
+p2s_6xN_sve 8                               // instantiated for heights 8 and 16
+p2s_6xN_sve 16
+
+// 4x2 block: dst = (src << P2S_SHIFT) + z31 per 16-bit lane.
+// x0 = src, x1 = src stride, x2 = dst, x3 = dst stride (scaled by p2s_start_sve).
+function PFX(filterPixelToShort_4x2_sve)
+    p2s_start_sve
+#if HIGH_BIT_DEPTH
+    ptrue           p0.h, vl8
+    // z1/z2 = {0, stride, 2*stride, ...}: per-row byte offsets for gather/scatter
+    index           z1.d, #0, x1
+    index           z2.d, #0, x3
+    // each 64-bit element is one row of four 16-bit pixels
+    // NOTE(review): p0 is built for .h elements but consumed by .d accesses - confirm
+    // this selects exactly the two rows intended.
+    ld1d            {z3.d}, p0/z, [x0, z1.d]
+    lsl             z3.h, p0/m, z3.h, #P2S_SHIFT
+    add             z3.h, p0/m, z3.h, z31.h
+    st1d            {z3.d}, p0, [x2, z2.d]
+#else
+    ptrue           p0.h, vl4
+    // ld1b into .h lanes zero-extends four bytes to four 16-bit lanes
+    ld1b            {z0.h}, p0/z, [x0]
+    add             x0, x0, x1
+    ld1b            {z1.h}, p0/z, [x0]
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
+    add             z0.h, p0/m, z0.h, z31.h
+    add             z1.h, p0/m, z1.h, z31.h
+    st1h            {z0.h}, p0, [x2]
+    add             x2, x2, x3
+    st1h            {z1.h}, p0, [x2]
+#endif
+    ret
+endfunc
+
+
+// Emits filterPixelToShort_8x<h>_sve: one 8-pixel row per .rept iteration,
+// dst = (src << P2S_SHIFT) + z31 per 16-bit lane.
+.macro p2s_8xN_sve h
+function PFX(filterPixelToShort_8x\h\()_sve)
+    p2s_start_sve
+    ptrue           p0.h, vl8
+.rept \h
+#if HIGH_BIT_DEPTH
+    // NOTE(review): the load uses .d elements under a predicate built for .h - confirm
+    ld1d            {z0.d}, p0/z, [x0]
+    add             x0, x0, x1
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
+    add             z0.h, p0/m, z0.h, z31.h
+    st1h            {z0.h}, p0, [x2]
+    add             x2, x2, x3
+#else
+    // ld1b into .h lanes zero-extends eight bytes to eight 16-bit lanes
+    ld1b            {z0.h}, p0/z, [x0]
+    add             x0, x0, x1
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
+    add             z0.h, p0/m, z0.h, z31.h
+    st1h            {z0.h}, p0, [x2]
+    add             x2, x2, x3
+#endif
+.endr
+    ret
+endfunc
+.endm
+
+p2s_8xN_sve 2
+
+.macro p2s_32xN_sve h
+function PFX(filterPixelToShort_32x\h\()_sve)
+#if HIGH_BIT_DEPTH
+    p2s_start_sve
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_filterPixelToShort_high_32x\h
+    ptrue           p0.h, vl8
+.rept \h
+    ld1h            {z0.h}, p0/z, x0
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
+    ld1h            {z2.h}, p0/z, x0, #2, mul vl
+    ld1h            {z3.h}, p0/z, x0, #3, mul vl
+    add             x0, x0, x1
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
+    lsl             z2.h, p0/m, z2.h, #P2S_SHIFT
+    lsl             z3.h, p0/m, z3.h, #P2S_SHIFT
+    add             z0.h, p0/m, z0.h, z31.h
+    add             z1.h, p0/m, z1.h, z31.h
+    add             z2.h, p0/m, z2.h, z31.h
+    add             z3.h, p0/m, z3.h, z31.h
+    st1h            {z0.h}, p0, x2
+    st1h            {z1.h}, p0, x2, #1, mul vl
+    st1h            {z2.h}, p0, x2, #2, mul vl
+    st1h            {z3.h}, p0, x2, #3, mul vl
+    add             x2, x2, x3
+.endr
+    ret
+.vl_gt_16_filterPixelToShort_high_32x\h\():
+    cmp             x9, #48
+    bgt             .vl_gt_48_filterPixelToShort_high_32x\h
+    ptrue           p0.h, vl16
+.rept \h
+    ld1h            {z0.h}, p0/z, x0
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
+    add             x0, x0, x1
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
+    add             z0.h, p0/m, z0.h, z31.h
+    add             z1.h, p0/m, z1.h, z31.h
+    st1h            {z0.h}, p0, x2
+    st1h            {z1.h}, p0, x2, #1, mul vl
+    add             x2, x2, x3
+.endr
+    ret
+.vl_gt_48_filterPixelToShort_high_32x\h\():
+    ptrue           p0.h, vl32
+.rept \h
+    ld1h            {z0.h}, p0/z, x0
+    add             x0, x0, x1
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
+    add             z0.h, p0/m, z0.h, z31.h
​

x265_3.6.tar.gz/source/common/aarch64/p2s.S Added

@@ -0,0 +1,386 @@
+/*****************************************************************************
+ * Copyright (C) 2021 MulticoreWare, Inc
+ *
+ * Authors: Sebastian Pop <spop@amazon.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "p2s-common.S"
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
+.macro p2s_2xN h                            // emits filterPixelToShort_2x<h>_neon
+function PFX(filterPixelToShort_2x\h\()_neon)
+    p2s_start                               // scales strides, loads the bias into v31
+.rept \h / 2                                // p2s_2x2 consumes two rows per expansion
+    p2s_2x2
+.endr
+    ret
+endfunc
+.endm
+
+p2s_2xN 4                                   // instantiated for heights 4, 8 and 16
+p2s_2xN 8
+p2s_2xN 16
+
+.macro p2s_6xN h                            // emits filterPixelToShort_6x<h>_neon
+function PFX(filterPixelToShort_6x\h\()_neon)
+    p2s_start
+    sub             x3, x3, #8              // p2s_6x2 post-increments x2 by 8 on the first store of each row
+#if HIGH_BIT_DEPTH
+    sub             x1, x1, #8              // same compensation for the first load of each row
+#endif
+.rept \h / 2                                // p2s_6x2 consumes two rows per expansion
+    p2s_6x2
+.endr
+    ret
+endfunc
+.endm
+
+p2s_6xN 8                                   // instantiated for heights 8 and 16
+p2s_6xN 16
+
+// 4x2 block: dst = (src << P2S_SHIFT) + v31 per 16-bit lane.
+// Both rows are packed into one vector (one row per 64-bit half of v3).
+function PFX(filterPixelToShort_4x2_neon)
+    p2s_start
+#if HIGH_BIT_DEPTH
+    // four 16-bit pixels (8 bytes) per row
+    ld1             {v0.d}[0], [x0], x1
+    ld1             {v0.d}[1], [x0], x1
+    shl             v3.8h, v0.8h, #P2S_SHIFT
+#else
+    // four 8-bit pixels (4 bytes) per row, widened to 16 bit by ushll
+    ld1             {v0.s}[0], [x0], x1
+    ld1             {v0.s}[1], [x0], x1
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
+#endif
+    add             v3.8h, v3.8h, v31.8h
+    st1             {v3.d}[0], [x2], x3
+    st1             {v3.d}[1], [x2], x3
+    ret
+endfunc
+
+// 4x4 block: dst = (src << P2S_SHIFT) + v31 per 16-bit lane.
+// Processed as two 4x2 halves; each half packs two rows into one vector.
+function PFX(filterPixelToShort_4x4_neon)
+    p2s_start
+    // rows 0 and 1
+#if HIGH_BIT_DEPTH
+    ld1             {v0.d}[0], [x0], x1
+    ld1             {v0.d}[1], [x0], x1
+    shl             v3.8h, v0.8h, #P2S_SHIFT
+#else
+    ld1             {v0.s}[0], [x0], x1
+    ld1             {v0.s}[1], [x0], x1
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
+#endif
+    add             v3.8h, v3.8h, v31.8h
+    st1             {v3.d}[0], [x2], x3
+    st1             {v3.d}[1], [x2], x3
+    // rows 2 and 3
+#if HIGH_BIT_DEPTH
+    ld1             {v1.d}[0], [x0], x1
+    ld1             {v1.d}[1], [x0], x1
+    shl             v4.8h, v1.8h, #P2S_SHIFT
+#else
+    ld1             {v1.s}[0], [x0], x1
+    ld1             {v1.s}[1], [x0], x1
+    ushll           v4.8h, v1.8b, #P2S_SHIFT
+#endif
+    add             v4.8h, v4.8h, v31.8h
+    st1             {v4.d}[0], [x2], x3
+    st1             {v4.d}[1], [x2], x3
+    ret
+endfunc
+
+// Emits filterPixelToShort_4x<h>_neon: two rows per .rept iteration,
+// dst = (src << P2S_SHIFT) + v31 per 16-bit lane.
+// Each load fetches a full 8-byte (or 16-byte when HIGH_BIT_DEPTH) vector, but
+// only the low four 16-bit results are biased and stored.
+.macro p2s_4xN h
+function PFX(filterPixelToShort_4x\h\()_neon)
+    p2s_start
+.rept \h / 2
+#if HIGH_BIT_DEPTH
+    ld1             {v0.16b}, [x0], x1
+    shl             v0.8h, v0.8h, #P2S_SHIFT
+#else
+    ld1             {v0.8b}, [x0], x1
+    ushll           v0.8h, v0.8b, #P2S_SHIFT
+#endif
+    add             v2.4h, v0.4h, v31.4h
+    st1             {v2.4h}, [x2], x3
+#if HIGH_BIT_DEPTH
+    ld1             {v1.16b}, [x0], x1
+    shl             v1.8h, v1.8h, #P2S_SHIFT
+#else
+    ld1             {v1.8b}, [x0], x1
+    ushll           v1.8h, v1.8b, #P2S_SHIFT
+#endif
+    add             v3.4h, v1.4h, v31.4h
+    st1             {v3.4h}, [x2], x3
+.endr
+    ret
+endfunc
+.endm
+
+p2s_4xN 8
+p2s_4xN 16
+p2s_4xN 32
+
+// Emits filterPixelToShort_8x<h>_neon: two 8-pixel rows per .rept iteration,
+// dst = (src << P2S_SHIFT) + v31 per 16-bit lane.
+.macro p2s_8xN h
+function PFX(filterPixelToShort_8x\h\()_neon)
+    p2s_start
+.rept \h / 2
+#if HIGH_BIT_DEPTH
+    // 16 bytes = eight 16-bit pixels per row
+    ld1             {v0.16b}, [x0], x1
+    ld1             {v1.16b}, [x0], x1
+    shl             v0.8h, v0.8h, #P2S_SHIFT
+    shl             v1.8h, v1.8h, #P2S_SHIFT
+#else
+    // 8 bytes = eight 8-bit pixels per row, widened by ushll
+    ld1             {v0.8b}, [x0], x1
+    ld1             {v1.8b}, [x0], x1
+    ushll           v0.8h, v0.8b, #P2S_SHIFT
+    ushll           v1.8h, v1.8b, #P2S_SHIFT
+#endif
+    add             v2.8h, v0.8h, v31.8h
+    st1             {v2.8h}, [x2], x3
+    add             v3.8h, v1.8h, v31.8h
+    st1             {v3.8h}, [x2], x3
+.endr
+    ret
+endfunc
+.endm
+
+p2s_8xN 2
+p2s_8xN 4
+p2s_8xN 6
+p2s_8xN 8
+p2s_8xN 12
+p2s_8xN 16
+p2s_8xN 32
+p2s_8xN 64
+
+.macro p2s_12xN h
+function PFX(filterPixelToShort_12x\h\()_neon)
+    p2s_start
+    sub             x3, x3, #16
+.rept \h
+#if HIGH_BIT_DEPTH
+    ld1             {v0.16b-v1.16b}, x0, x1
+    shl             v2.8h, v0.8h, #P2S_SHIFT
+    shl             v3.8h, v1.8h, #P2S_SHIFT
+#else
+    ld1             {v0.16b}, x0, x1
+    ushll           v2.8h, v0.8b,  #P2S_SHIFT
+    ushll2          v3.8h, v0.16b, #P2S_SHIFT
+#endif
+    add             v2.8h, v2.8h, v31.8h
+    add             v3.8h, v3.8h, v31.8h
+    st1             {v2.16b}, x2, #16
+    st1             {v3.8b}, x2, x3
+.endr
+    ret
+endfunc

 
@@ -0,0 +1,386 @@
+/*****************************************************************************
+ * Copyright (C) 2021 MulticoreWare, Inc
+ *
+ * Authors: Sebastian Pop <spop@amazon.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "p2s-common.S"
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
+.macro p2s_2xN h                            // emits filterPixelToShort_2x<h>_neon
+function PFX(filterPixelToShort_2x\h\()_neon)
+    p2s_start                               // scales strides, loads the bias into v31
+.rept \h / 2                                // p2s_2x2 consumes two rows per expansion
+    p2s_2x2
+.endr
+    ret
+endfunc
+.endm
+
+p2s_2xN 4                                   // instantiated for heights 4, 8 and 16
+p2s_2xN 8
+p2s_2xN 16
+
+.macro p2s_6xN h                            // emits filterPixelToShort_6x<h>_neon
+function PFX(filterPixelToShort_6x\h\()_neon)
+    p2s_start
+    sub             x3, x3, #8              // p2s_6x2 post-increments x2 by 8 on the first store of each row
+#if HIGH_BIT_DEPTH
+    sub             x1, x1, #8              // same compensation for the first load of each row
+#endif
+.rept \h / 2                                // p2s_6x2 consumes two rows per expansion
+    p2s_6x2
+.endr
+    ret
+endfunc
+.endm
+
+p2s_6xN 8                                   // instantiated for heights 8 and 16
+p2s_6xN 16
+
+// 4x2 block: dst = (src << P2S_SHIFT) + v31 per 16-bit lane.
+// Both rows are packed into one vector (one row per 64-bit half of v3).
+function PFX(filterPixelToShort_4x2_neon)
+    p2s_start
+#if HIGH_BIT_DEPTH
+    // four 16-bit pixels (8 bytes) per row
+    ld1             {v0.d}[0], [x0], x1
+    ld1             {v0.d}[1], [x0], x1
+    shl             v3.8h, v0.8h, #P2S_SHIFT
+#else
+    // four 8-bit pixels (4 bytes) per row, widened to 16 bit by ushll
+    ld1             {v0.s}[0], [x0], x1
+    ld1             {v0.s}[1], [x0], x1
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
+#endif
+    add             v3.8h, v3.8h, v31.8h
+    st1             {v3.d}[0], [x2], x3
+    st1             {v3.d}[1], [x2], x3
+    ret
+endfunc
+
+// 4x4 block: dst = (src << P2S_SHIFT) + v31 per 16-bit lane.
+// Processed as two 4x2 halves; each half packs two rows into one vector.
+function PFX(filterPixelToShort_4x4_neon)
+    p2s_start
+    // rows 0 and 1
+#if HIGH_BIT_DEPTH
+    ld1             {v0.d}[0], [x0], x1
+    ld1             {v0.d}[1], [x0], x1
+    shl             v3.8h, v0.8h, #P2S_SHIFT
+#else
+    ld1             {v0.s}[0], [x0], x1
+    ld1             {v0.s}[1], [x0], x1
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
+#endif
+    add             v3.8h, v3.8h, v31.8h
+    st1             {v3.d}[0], [x2], x3
+    st1             {v3.d}[1], [x2], x3
+    // rows 2 and 3
+#if HIGH_BIT_DEPTH
+    ld1             {v1.d}[0], [x0], x1
+    ld1             {v1.d}[1], [x0], x1
+    shl             v4.8h, v1.8h, #P2S_SHIFT
+#else
+    ld1             {v1.s}[0], [x0], x1
+    ld1             {v1.s}[1], [x0], x1
+    ushll           v4.8h, v1.8b, #P2S_SHIFT
+#endif
+    add             v4.8h, v4.8h, v31.8h
+    st1             {v4.d}[0], [x2], x3
+    st1             {v4.d}[1], [x2], x3
+    ret
+endfunc
+
+// Emits filterPixelToShort_4x<h>_neon: two rows per .rept iteration,
+// dst = (src << P2S_SHIFT) + v31 per 16-bit lane.
+// Each load fetches a full 8-byte (or 16-byte when HIGH_BIT_DEPTH) vector, but
+// only the low four 16-bit results are biased and stored.
+.macro p2s_4xN h
+function PFX(filterPixelToShort_4x\h\()_neon)
+    p2s_start
+.rept \h / 2
+#if HIGH_BIT_DEPTH
+    ld1             {v0.16b}, [x0], x1
+    shl             v0.8h, v0.8h, #P2S_SHIFT
+#else
+    ld1             {v0.8b}, [x0], x1
+    ushll           v0.8h, v0.8b, #P2S_SHIFT
+#endif
+    add             v2.4h, v0.4h, v31.4h
+    st1             {v2.4h}, [x2], x3
+#if HIGH_BIT_DEPTH
+    ld1             {v1.16b}, [x0], x1
+    shl             v1.8h, v1.8h, #P2S_SHIFT
+#else
+    ld1             {v1.8b}, [x0], x1
+    ushll           v1.8h, v1.8b, #P2S_SHIFT
+#endif
+    add             v3.4h, v1.4h, v31.4h
+    st1             {v3.4h}, [x2], x3
+.endr
+    ret
+endfunc
+.endm
+
+p2s_4xN 8
+p2s_4xN 16
+p2s_4xN 32
+
+// Emits filterPixelToShort_8x<h>_neon: two 8-pixel rows per .rept iteration,
+// dst = (src << P2S_SHIFT) + v31 per 16-bit lane.
+.macro p2s_8xN h
+function PFX(filterPixelToShort_8x\h\()_neon)
+    p2s_start
+.rept \h / 2
+#if HIGH_BIT_DEPTH
+    // 16 bytes = eight 16-bit pixels per row
+    ld1             {v0.16b}, [x0], x1
+    ld1             {v1.16b}, [x0], x1
+    shl             v0.8h, v0.8h, #P2S_SHIFT
+    shl             v1.8h, v1.8h, #P2S_SHIFT
+#else
+    // 8 bytes = eight 8-bit pixels per row, widened by ushll
+    ld1             {v0.8b}, [x0], x1
+    ld1             {v1.8b}, [x0], x1
+    ushll           v0.8h, v0.8b, #P2S_SHIFT
+    ushll           v1.8h, v1.8b, #P2S_SHIFT
+#endif
+    add             v2.8h, v0.8h, v31.8h
+    st1             {v2.8h}, [x2], x3
+    add             v3.8h, v1.8h, v31.8h
+    st1             {v3.8h}, [x2], x3
+.endr
+    ret
+endfunc
+.endm
+
+p2s_8xN 2
+p2s_8xN 4
+p2s_8xN 6
+p2s_8xN 8
+p2s_8xN 12
+p2s_8xN 16
+p2s_8xN 32
+p2s_8xN 64
+
+.macro p2s_12xN h
+function PFX(filterPixelToShort_12x\h\()_neon)
+    p2s_start
+    sub             x3, x3, #16
+.rept \h
+#if HIGH_BIT_DEPTH
+    ld1             {v0.16b-v1.16b}, x0, x1
+    shl             v2.8h, v0.8h, #P2S_SHIFT
+    shl             v3.8h, v1.8h, #P2S_SHIFT
+#else
+    ld1             {v0.16b}, x0, x1
+    ushll           v2.8h, v0.8b,  #P2S_SHIFT
+    ushll2          v3.8h, v0.16b, #P2S_SHIFT
+#endif
+    add             v2.8h, v2.8h, v31.8h
+    add             v3.8h, v3.8h, v31.8h
+    st1             {v2.16b}, x2, #16
+    st1             {v3.8b}, x2, x3
+.endr
+    ret
+endfunc
​

x265_3.6.tar.gz/source/common/aarch64/pixel-prim.cpp Added

@@ -0,0 +1,2059 @@
+#include "common.h"
+#include "slicetype.h"      // LOWRES_COST_MASK
+#include "primitives.h"
+#include "x265.h"
+
+#include "pixel-prim.h"
+#include "arm64-utils.h"
+#if HAVE_NEON
+
+#include <arm_neon.h>
+
+using namespace X265_NS;
+
+
+
+namespace
+{
+
+
+/* SATD SA8D variants - based on x264 */
+// Butterfly step on eight 16-bit lanes: sum = a + b, sub = a - b.
+static inline void SUMSUB_AB(int16x8_t &sum, int16x8_t &sub, const int16x8_t a, const int16x8_t b)
+{
+    const int16x8_t lanesAdded = vaddq_s16(a, b);
+    const int16x8_t lanesSubtracted = vsubq_s16(a, b);
+
+    sum = lanesAdded;
+    sub = lanesSubtracted;
+}
+
+// Interleaves 16-bit lanes: t1 takes the even-indexed lane pairs of (s1, s2),
+// t2 the odd-indexed ones (TRN1 / TRN2).
+static inline void transpose_8h(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
+{
+    const int16x8_t evenLanes = vtrn1q_s16(s1, s2);
+    const int16x8_t oddLanes = vtrn2q_s16(s1, s2);
+
+    t1 = evenLanes;
+    t2 = oddLanes;
+}
+
+// TRN1 / TRN2 on the inputs viewed as four 32-bit lanes (i.e. pairs of 16-bit
+// lanes move together). The reinterprets are explicit so the code does not
+// depend on lax vector conversions being enabled; they are pure bit casts.
+static inline void transpose_4s(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
+{
+    const int32x4_t a = vreinterpretq_s32_s16(s1);
+    const int32x4_t b = vreinterpretq_s32_s16(s2);
+
+    t1 = vreinterpretq_s16_s32(vtrn1q_s32(a, b));
+    t2 = vreinterpretq_s16_s32(vtrn2q_s32(a, b));
+}
+
+#if (X265_DEPTH <= 10)
+// TRN1 / TRN2 on the inputs viewed as two 64-bit lanes: t1 = {s1.lo, s2.lo},
+// t2 = {s1.hi, s2.hi}. The reinterprets are explicit bit casts so the code does
+// not depend on lax vector conversions being enabled.
+static inline void transpose_2d(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
+{
+    const int64x2_t a = vreinterpretq_s64_s16(s1);
+    const int64x2_t b = vreinterpretq_s64_s16(s2);
+
+    t1 = vreinterpretq_s16_s64(vtrn1q_s64(a, b));
+    t2 = vreinterpretq_s16_s64(vtrn2q_s64(a, b));
+}
+#endif
+
+
+// Two independent butterflies: (s1, d1) = (a + b, a - b) and (s2, d2) = (c + d, c - d).
+static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2,
+                               int16x8_t a, int16x8_t  b, int16x8_t  c, int16x8_t  d)
+{
+    // first pair
+    s1 = vaddq_s16(a, b);
+    d1 = vsubq_s16(a, b);
+
+    // second pair
+    s2 = vaddq_s16(c, d);
+    d2 = vsubq_s16(c, d);
+}
+
+static inline void HADAMARD4_V(int16x8_t &r1, int16x8_t &r2, int16x8_t &r3, int16x8_t &r4,
+                               int16x8_t &t1, int16x8_t &t2, int16x8_t &t3, int16x8_t &t4)
+{
+    SUMSUB_ABCD(t1, t2, t3, t4, r1, r2, r3, r4);    // stage 1: t1 = r1 + r2, t2 = r1 - r2, t3 = r3 + r4, t4 = r3 - r4
+    SUMSUB_ABCD(r1, r3, r2, r4, t1, t3, t2, t4);    // stage 2: r1 = t1 + t3, r3 = t1 - t3, r2 = t2 + t4, r4 = t2 - t4 (results land back in r1..r4; t1..t4 are scratch)
+}
+
+
+// Final stage shared by the 4x8 and 8x4 SATD paths: runs the remaining
+// butterfly / transpose steps on four vectors of 16-bit differences and returns
+// the horizontal sum of max(|x|, |y|) over the paired coefficients.
+// All vector type changes are explicit bit casts, so the function does not
+// depend on lax vector conversions being enabled.
+static int _satd_4x8_8x4_end_neon(int16x8_t v0, int16x8_t v1, int16x8_t v2, int16x8_t v3)
+{
+    int16x8_t v4, v5, v6, v7, v16, v17, v18, v19;
+
+    SUMSUB_AB(v16, v17, v0,  v1);
+    SUMSUB_AB(v18, v19, v2,  v3);
+
+    SUMSUB_AB(v4 , v6 , v16, v18);
+    SUMSUB_AB(v5 , v7 , v17, v19);
+
+    v0 = vtrn1q_s16(v4, v5);
+    v1 = vtrn2q_s16(v4, v5);
+    v2 = vtrn1q_s16(v6, v7);
+    v3 = vtrn2q_s16(v6, v7);
+
+    SUMSUB_AB(v16, v17, v0,  v1);
+    SUMSUB_AB(v18, v19, v2,  v3);
+
+    // 32-bit transposes: pairs of 16-bit lanes move together
+    const int32x4_t w16 = vreinterpretq_s32_s16(v16);
+    const int32x4_t w17 = vreinterpretq_s32_s16(v17);
+    const int32x4_t w18 = vreinterpretq_s32_s16(v18);
+    const int32x4_t w19 = vreinterpretq_s32_s16(v19);
+
+    v0 = vreinterpretq_s16_s32(vtrn1q_s32(w16, w18));
+    v1 = vreinterpretq_s16_s32(vtrn2q_s32(w16, w18));
+    v2 = vreinterpretq_s16_s32(vtrn1q_s32(w17, w19));
+    v3 = vreinterpretq_s16_s32(vtrn2q_s32(w17, w19));
+
+    // magnitudes are compared and accumulated as unsigned 16-bit values
+    const uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
+    const uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
+    const uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
+    const uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
+
+    const uint16x8_t max01 = vmaxq_u16(abs0, abs1);
+    const uint16x8_t max23 = vmaxq_u16(abs2, abs3);
+
+    return vaddlvq_u16(vaddq_u16(max01, max23));
+}
+
+// 4x4 SATD core on two vectors of 16-bit differences (v0, v1): three butterfly
+// stages separated by 64-, 16- and 32-bit lane shuffles, then the horizontal
+// sum of max(|x|, |y|) over the paired coefficients.
+// Vector type changes are explicit bit casts (no reliance on lax vector
+// conversions). The final reduction is unsigned, matching the unsigned max that
+// precedes it and the sibling _satd_4x8_8x4_end_neon; the original summed the
+// unsigned maxima as signed values, which only differs for a lane of 0x8000.
+static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
+{
+    int16x8_t v2, v3;
+    SUMSUB_AB(v2,  v3,  v0,  v1);
+
+    const int64x2_t d2 = vreinterpretq_s64_s16(v2);
+    const int64x2_t d3 = vreinterpretq_s64_s16(v3);
+    v0 = vreinterpretq_s16_s64(vzip1q_s64(d2, d3));
+    v1 = vreinterpretq_s16_s64(vzip2q_s64(d2, d3));
+    SUMSUB_AB(v2,  v3,  v0,  v1);
+
+    v0 = vtrn1q_s16(v2, v3);
+    v1 = vtrn2q_s16(v2, v3);
+    SUMSUB_AB(v2,  v3,  v0,  v1);
+
+    const int32x4_t w2 = vreinterpretq_s32_s16(v2);
+    const int32x4_t w3 = vreinterpretq_s32_s16(v3);
+    v0 = vreinterpretq_s16_s32(vtrn1q_s32(w2, w3));
+    v1 = vreinterpretq_s16_s32(vtrn2q_s32(w2, w3));
+
+    const uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
+    const uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
+
+    return vaddlvq_u16(vmaxq_u16(abs0, abs1));
+}
+
+// Butterfly / transpose stages over eight vectors of 16-bit differences.
+// On entry v0..v3 and v20..v23 hold the inputs; on exit v0..v3 hold four vectors
+// of per-lane max(|x|, |y|) values for the caller to accumulate. v20..v23 are
+// overwritten as scratch. The statement order matters: v0..v3 are consumed by
+// the first two butterflies before HADAMARD4_V reuses them as temporaries.
+// The unsigned max is applied through explicit bit casts, so the function does
+// not depend on lax vector conversions being enabled.
+static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3, int16x8_t &v20,
+                                 int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
+{
+    int16x8_t v16, v17, v18, v19, v4, v5, v6, v7;
+
+    SUMSUB_AB(v16, v18, v0,  v2);
+    SUMSUB_AB(v17, v19, v1,  v3);
+
+    HADAMARD4_V(v20, v21, v22, v23, v0,  v1, v2, v3);
+
+    transpose_8h(v0,  v1,  v16, v17);
+    transpose_8h(v2,  v3,  v18, v19);
+    transpose_8h(v4,  v5,  v20, v21);
+    transpose_8h(v6,  v7,  v22, v23);
+
+    SUMSUB_AB(v16, v17, v0,  v1);
+    SUMSUB_AB(v18, v19, v2,  v3);
+    SUMSUB_AB(v20, v21, v4,  v5);
+    SUMSUB_AB(v22, v23, v6,  v7);
+
+    transpose_4s(v0,  v2,  v16, v18);
+    transpose_4s(v1,  v3,  v17, v19);
+    transpose_4s(v4,  v6,  v20, v22);
+    transpose_4s(v5,  v7,  v21, v23);
+
+    // magnitudes, compared as unsigned 16-bit values
+    const uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
+    const uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
+    const uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
+    const uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
+    const uint16x8_t abs4 = vreinterpretq_u16_s16(vabsq_s16(v4));
+    const uint16x8_t abs5 = vreinterpretq_u16_s16(vabsq_s16(v5));
+    const uint16x8_t abs6 = vreinterpretq_u16_s16(vabsq_s16(v6));
+    const uint16x8_t abs7 = vreinterpretq_u16_s16(vabsq_s16(v7));
+
+    v0 = vreinterpretq_s16_u16(vmaxq_u16(abs0, abs2));
+    v1 = vreinterpretq_s16_u16(vmaxq_u16(abs1, abs3));
+    v2 = vreinterpretq_s16_u16(vmaxq_u16(abs4, abs6));
+    v3 = vreinterpretq_s16_u16(vmaxq_u16(abs5, abs7));
+}
+
+#if HIGH_BIT_DEPTH
+
+#if (X265_DEPTH > 10)
+// 32-bit-lane variant: TRN1 / TRN2 on the inputs viewed as two 64-bit lanes, so
+// t1 = {s1.lo, s2.lo} and t2 = {s1.hi, s2.hi}. The reinterprets are explicit bit
+// casts so the code does not depend on lax vector conversions being enabled.
+static inline void transpose_2d(int32x4_t &t1, int32x4_t &t2, const int32x4_t s1, const int32x4_t s2)
+{
+    const int64x2_t a = vreinterpretq_s64_s32(s1);
+    const int64x2_t b = vreinterpretq_s64_s32(s2);
+
+    t1 = vreinterpretq_s32_s64(vtrn1q_s64(a, b));
+    t2 = vreinterpretq_s32_s64(vtrn2q_s64(a, b));
+}
+
+// Butterfly step on four 32-bit lanes: sum = a + b, sub = a - b.
+static inline void ISUMSUB_AB(int32x4_t &sum, int32x4_t &sub, const int32x4_t a, const int32x4_t b)
+{
+    const int32x4_t lanesAdded = vaddq_s32(a, b);
+    const int32x4_t lanesSubtracted = vsubq_s32(a, b);
+
+    sum = lanesAdded;
+    sub = lanesSubtracted;
+}
+
+// Widening butterfly: a + b and a - b are computed from 16-bit lanes into 32-bit
+// lanes. The "l" outputs cover lanes 0..3, the "h" outputs lanes 4..7.
+static inline void ISUMSUB_AB_FROM_INT16(int32x4_t &suml, int32x4_t &sumh, int32x4_t &subl, int32x4_t &subh,
+        const int16x8_t a, const int16x8_t b)
+{
+    const int16x4_t aLow = vget_low_s16(a);
+    const int16x4_t bLow = vget_low_s16(b);
+
+    suml = vaddl_s16(aLow, bLow);
+    sumh = vaddl_high_s16(a, b);
+    subl = vsubl_s16(aLow, bLow);
+    subh = vsubl_high_s16(a, b);
+}
+
+#endif
+
+static inline void _sub_8x8_fly(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
+                                int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3,
+                                int16x8_t &v20, int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
+{
+    uint16x8_t r0, r1, r2, r3;
+    uint16x8_t t0, t1, t2, t3;
+    int16x8_t v16, v17;
+    int16x8_t v18, v19;
+

 
@@ -0,0 +1,2059 @@
+#include "common.h"
+#include "slicetype.h"      // LOWRES_COST_MASK
+#include "primitives.h"
+#include "x265.h"
+
+#include "pixel-prim.h"
+#include "arm64-utils.h"
+#if HAVE_NEON
+
+#include <arm_neon.h>
+
+using namespace X265_NS;
+
+
+
+namespace
+{
+
+
+/* SATD SA8D variants - based on x264 */
+// Butterfly step on eight 16-bit lanes: sum = a + b, sub = a - b.
+static inline void SUMSUB_AB(int16x8_t &sum, int16x8_t &sub, const int16x8_t a, const int16x8_t b)
+{
+    const int16x8_t lanesAdded = vaddq_s16(a, b);
+    const int16x8_t lanesSubtracted = vsubq_s16(a, b);
+
+    sum = lanesAdded;
+    sub = lanesSubtracted;
+}
+
+// Interleaves 16-bit lanes: t1 takes the even-indexed lane pairs of (s1, s2),
+// t2 the odd-indexed ones (TRN1 / TRN2).
+static inline void transpose_8h(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
+{
+    const int16x8_t evenLanes = vtrn1q_s16(s1, s2);
+    const int16x8_t oddLanes = vtrn2q_s16(s1, s2);
+
+    t1 = evenLanes;
+    t2 = oddLanes;
+}
+
+// TRN1 / TRN2 on the inputs viewed as four 32-bit lanes (i.e. pairs of 16-bit
+// lanes move together). The reinterprets are explicit so the code does not
+// depend on lax vector conversions being enabled; they are pure bit casts.
+static inline void transpose_4s(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
+{
+    const int32x4_t a = vreinterpretq_s32_s16(s1);
+    const int32x4_t b = vreinterpretq_s32_s16(s2);
+
+    t1 = vreinterpretq_s16_s32(vtrn1q_s32(a, b));
+    t2 = vreinterpretq_s16_s32(vtrn2q_s32(a, b));
+}
+
+#if (X265_DEPTH <= 10)
+// TRN1 / TRN2 on the inputs viewed as two 64-bit lanes: t1 = {s1.lo, s2.lo},
+// t2 = {s1.hi, s2.hi}. The reinterprets are explicit bit casts so the code does
+// not depend on lax vector conversions being enabled.
+static inline void transpose_2d(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
+{
+    const int64x2_t a = vreinterpretq_s64_s16(s1);
+    const int64x2_t b = vreinterpretq_s64_s16(s2);
+
+    t1 = vreinterpretq_s16_s64(vtrn1q_s64(a, b));
+    t2 = vreinterpretq_s16_s64(vtrn2q_s64(a, b));
+}
+#endif
+
+
+// Two independent butterflies: (s1, d1) = (a + b, a - b) and (s2, d2) = (c + d, c - d).
+static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2,
+                               int16x8_t a, int16x8_t  b, int16x8_t  c, int16x8_t  d)
+{
+    // first pair
+    s1 = vaddq_s16(a, b);
+    d1 = vsubq_s16(a, b);
+
+    // second pair
+    s2 = vaddq_s16(c, d);
+    d2 = vsubq_s16(c, d);
+}
+
+static inline void HADAMARD4_V(int16x8_t &r1, int16x8_t &r2, int16x8_t &r3, int16x8_t &r4,
+                               int16x8_t &t1, int16x8_t &t2, int16x8_t &t3, int16x8_t &t4)
+{
+    SUMSUB_ABCD(t1, t2, t3, t4, r1, r2, r3, r4);    // stage 1: t1 = r1 + r2, t2 = r1 - r2, t3 = r3 + r4, t4 = r3 - r4
+    SUMSUB_ABCD(r1, r3, r2, r4, t1, t3, t2, t4);    // stage 2: r1 = t1 + t3, r3 = t1 - t3, r2 = t2 + t4, r4 = t2 - t4 (results land back in r1..r4; t1..t4 are scratch)
+}
+
+
+// Final stage shared by the 4x8 and 8x4 SATD paths: runs the remaining
+// butterfly / transpose steps on four vectors of 16-bit differences and returns
+// the horizontal sum of max(|x|, |y|) over the paired coefficients.
+// All vector type changes are explicit bit casts, so the function does not
+// depend on lax vector conversions being enabled.
+static int _satd_4x8_8x4_end_neon(int16x8_t v0, int16x8_t v1, int16x8_t v2, int16x8_t v3)
+{
+    int16x8_t v4, v5, v6, v7, v16, v17, v18, v19;
+
+    SUMSUB_AB(v16, v17, v0,  v1);
+    SUMSUB_AB(v18, v19, v2,  v3);
+
+    SUMSUB_AB(v4 , v6 , v16, v18);
+    SUMSUB_AB(v5 , v7 , v17, v19);
+
+    v0 = vtrn1q_s16(v4, v5);
+    v1 = vtrn2q_s16(v4, v5);
+    v2 = vtrn1q_s16(v6, v7);
+    v3 = vtrn2q_s16(v6, v7);
+
+    SUMSUB_AB(v16, v17, v0,  v1);
+    SUMSUB_AB(v18, v19, v2,  v3);
+
+    // 32-bit transposes: pairs of 16-bit lanes move together
+    const int32x4_t w16 = vreinterpretq_s32_s16(v16);
+    const int32x4_t w17 = vreinterpretq_s32_s16(v17);
+    const int32x4_t w18 = vreinterpretq_s32_s16(v18);
+    const int32x4_t w19 = vreinterpretq_s32_s16(v19);
+
+    v0 = vreinterpretq_s16_s32(vtrn1q_s32(w16, w18));
+    v1 = vreinterpretq_s16_s32(vtrn2q_s32(w16, w18));
+    v2 = vreinterpretq_s16_s32(vtrn1q_s32(w17, w19));
+    v3 = vreinterpretq_s16_s32(vtrn2q_s32(w17, w19));
+
+    // magnitudes are compared and accumulated as unsigned 16-bit values
+    const uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
+    const uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
+    const uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
+    const uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
+
+    const uint16x8_t max01 = vmaxq_u16(abs0, abs1);
+    const uint16x8_t max23 = vmaxq_u16(abs2, abs3);
+
+    return vaddlvq_u16(vaddq_u16(max01, max23));
+}
+
+// 4x4 SATD core on two vectors of 16-bit differences (v0, v1): three butterfly
+// stages separated by 64-, 16- and 32-bit lane shuffles, then the horizontal
+// sum of max(|x|, |y|) over the paired coefficients.
+// Vector type changes are explicit bit casts (no reliance on lax vector
+// conversions). The final reduction is unsigned, matching the unsigned max that
+// precedes it and the sibling _satd_4x8_8x4_end_neon; the original summed the
+// unsigned maxima as signed values, which only differs for a lane of 0x8000.
+static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
+{
+    int16x8_t v2, v3;
+    SUMSUB_AB(v2,  v3,  v0,  v1);
+
+    const int64x2_t d2 = vreinterpretq_s64_s16(v2);
+    const int64x2_t d3 = vreinterpretq_s64_s16(v3);
+    v0 = vreinterpretq_s16_s64(vzip1q_s64(d2, d3));
+    v1 = vreinterpretq_s16_s64(vzip2q_s64(d2, d3));
+    SUMSUB_AB(v2,  v3,  v0,  v1);
+
+    v0 = vtrn1q_s16(v2, v3);
+    v1 = vtrn2q_s16(v2, v3);
+    SUMSUB_AB(v2,  v3,  v0,  v1);
+
+    const int32x4_t w2 = vreinterpretq_s32_s16(v2);
+    const int32x4_t w3 = vreinterpretq_s32_s16(v3);
+    v0 = vreinterpretq_s16_s32(vtrn1q_s32(w2, w3));
+    v1 = vreinterpretq_s16_s32(vtrn2q_s32(w2, w3));
+
+    const uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
+    const uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
+
+    return vaddlvq_u16(vmaxq_u16(abs0, abs1));
+}
+
+// Butterfly / transpose stages over eight vectors of 16-bit differences.
+// On entry v0..v3 and v20..v23 hold the inputs; on exit v0..v3 hold four vectors
+// of per-lane max(|x|, |y|) values for the caller to accumulate. v20..v23 are
+// overwritten as scratch. The statement order matters: v0..v3 are consumed by
+// the first two butterflies before HADAMARD4_V reuses them as temporaries.
+// The unsigned max is applied through explicit bit casts, so the function does
+// not depend on lax vector conversions being enabled.
+static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3, int16x8_t &v20,
+                                 int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
+{
+    int16x8_t v16, v17, v18, v19, v4, v5, v6, v7;
+
+    SUMSUB_AB(v16, v18, v0,  v2);
+    SUMSUB_AB(v17, v19, v1,  v3);
+
+    HADAMARD4_V(v20, v21, v22, v23, v0,  v1, v2, v3);
+
+    transpose_8h(v0,  v1,  v16, v17);
+    transpose_8h(v2,  v3,  v18, v19);
+    transpose_8h(v4,  v5,  v20, v21);
+    transpose_8h(v6,  v7,  v22, v23);
+
+    SUMSUB_AB(v16, v17, v0,  v1);
+    SUMSUB_AB(v18, v19, v2,  v3);
+    SUMSUB_AB(v20, v21, v4,  v5);
+    SUMSUB_AB(v22, v23, v6,  v7);
+
+    transpose_4s(v0,  v2,  v16, v18);
+    transpose_4s(v1,  v3,  v17, v19);
+    transpose_4s(v4,  v6,  v20, v22);
+    transpose_4s(v5,  v7,  v21, v23);
+
+    // magnitudes, compared as unsigned 16-bit values
+    const uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
+    const uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
+    const uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
+    const uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
+    const uint16x8_t abs4 = vreinterpretq_u16_s16(vabsq_s16(v4));
+    const uint16x8_t abs5 = vreinterpretq_u16_s16(vabsq_s16(v5));
+    const uint16x8_t abs6 = vreinterpretq_u16_s16(vabsq_s16(v6));
+    const uint16x8_t abs7 = vreinterpretq_u16_s16(vabsq_s16(v7));
+
+    v0 = vreinterpretq_s16_u16(vmaxq_u16(abs0, abs2));
+    v1 = vreinterpretq_s16_u16(vmaxq_u16(abs1, abs3));
+    v2 = vreinterpretq_s16_u16(vmaxq_u16(abs4, abs6));
+    v3 = vreinterpretq_s16_u16(vmaxq_u16(abs5, abs7));
+}
+
+#if HIGH_BIT_DEPTH
+
+#if (X265_DEPTH > 10)
+// 32-bit-lane variant: TRN1 / TRN2 on the inputs viewed as two 64-bit lanes, so
+// t1 = {s1.lo, s2.lo} and t2 = {s1.hi, s2.hi}. The reinterprets are explicit bit
+// casts so the code does not depend on lax vector conversions being enabled.
+static inline void transpose_2d(int32x4_t &t1, int32x4_t &t2, const int32x4_t s1, const int32x4_t s2)
+{
+    const int64x2_t a = vreinterpretq_s64_s32(s1);
+    const int64x2_t b = vreinterpretq_s64_s32(s2);
+
+    t1 = vreinterpretq_s32_s64(vtrn1q_s64(a, b));
+    t2 = vreinterpretq_s32_s64(vtrn2q_s64(a, b));
+}
+
+// Butterfly step on four 32-bit lanes: sum = a + b, sub = a - b.
+static inline void ISUMSUB_AB(int32x4_t &sum, int32x4_t &sub, const int32x4_t a, const int32x4_t b)
+{
+    const int32x4_t lanesAdded = vaddq_s32(a, b);
+    const int32x4_t lanesSubtracted = vsubq_s32(a, b);
+
+    sum = lanesAdded;
+    sub = lanesSubtracted;
+}
+
+// Widening butterfly: a + b and a - b are computed from 16-bit lanes into 32-bit
+// lanes. The "l" outputs cover lanes 0..3, the "h" outputs lanes 4..7.
+static inline void ISUMSUB_AB_FROM_INT16(int32x4_t &suml, int32x4_t &sumh, int32x4_t &subl, int32x4_t &subh,
+        const int16x8_t a, const int16x8_t b)
+{
+    const int16x4_t aLow = vget_low_s16(a);
+    const int16x4_t bLow = vget_low_s16(b);
+
+    suml = vaddl_s16(aLow, bLow);
+    sumh = vaddl_high_s16(a, b);
+    subl = vsubl_s16(aLow, bLow);
+    subh = vsubl_high_s16(a, b);
+}
+
+#endif
+
+static inline void _sub_8x8_fly(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
+                                int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3,
+                                int16x8_t &v20, int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
+{
+    uint16x8_t r0, r1, r2, r3;
+    uint16x8_t t0, t1, t2, t3;
+    int16x8_t v16, v17;
+    int16x8_t v18, v19;
+
​

x265_3.6.tar.gz/source/common/aarch64/pixel-prim.h Added

 
@@ -0,0 +1,23 @@
+#ifndef PIXEL_PRIM_NEON_H__
+#define PIXEL_PRIM_NEON_H__
+
+#include "common.h"
+#include "slicetype.h"      // LOWRES_COST_MASK
+#include "primitives.h"
+#include "x265.h"
+
+
+
+namespace X265_NS
+{
+
+
+// Declaration only; the definition is not part of this header.
+void setupPixelPrimitives_neon(EncoderPrimitives &p); // NOTE(review): presumably registers the NEON pixel kernels in p - confirm
+
+
+} // namespace X265_NS
+
+
+#endif
+
​

x265_3.6.tar.gz/source/common/aarch64/pixel-util-common.S Added

@@ -0,0 +1,84 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// This file contains the macros written using NEON instruction set
+// that are also used by the SVE2 functions
+
+.arch           armv8-a
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.macro pixel_var_start               // clear the four accumulators updated by pixel_var_1
+    movi            v0.16b, #0       // sum accumulator fed from bytes 0-7 of each input vector
+    movi            v1.16b, #0       // sum accumulator fed from bytes 8-15 of each input vector
+    movi            v2.16b, #0       // squares accumulator for bytes 0-7
+    movi            v3.16b, #0       // squares accumulator for bytes 8-15
+.endm
+
+.macro pixel_var_1 v                                 // fold one 16-byte vector into v0-v3; clobbers v30/v31
+    uaddw           v0.8h, v0.8h, \v\().8b           // v0 += bytes 0-7, widened to 16 bits
+    umull           v30.8h, \v\().8b, \v\().8b       // v30 = squares of bytes 0-7
+    uaddw2          v1.8h, v1.8h, \v\().16b          // v1 += bytes 8-15, widened to 16 bits
+    umull2          v31.8h, \v\().16b, \v\().16b     // v31 = squares of bytes 8-15
+    uadalp          v2.4s, v30.8h                    // v2 += pairwise sums of v30, as 32-bit lanes
+    uadalp          v3.4s, v31.8h                    // v3 += pairwise sums of v31, as 32-bit lanes
+.endm
+
+// Reduce the v0-v3 accumulators and pack the result into x0:
+// bits 0-31 hold the pixel sum, bits 32-63 hold the sum of squares. Clobbers x2.
+.macro pixel_var_end
+    uaddlv          s0, v0.8h                // s0 = total of the v0 lanes
+    uaddlv          s1, v1.8h                // s1 = total of the v1 lanes
+    add             v2.4s, v2.4s, v3.4s      // merge the two squares accumulators
+    // The totals are integers, so they are added with an integer instruction; a
+    // floating-point add would depend on subnormals being preserved (FPCR.FZ clear).
+    // uaddlv zeroed bits 32-63 of d0 and d1, so the 64-bit add leaves the sum in s0.
+    add             d0, d0, d1
+    uaddlv          d2, v2.4s                // d2 = total of squares
+    fmov            w0, s0
+    fmov            x2, d2
+    orr             x0, x0, x2, lsl #32      // x0 = sum | (squares << 32)
+.endm
+
+.macro ssimDist_start                // clear the two accumulators that ssimDist_end reduces
+    movi            v0.16b, #0
+    movi            v1.16b, #0
+.endm
+
+// Reduce the two accumulators (32-bit lanes) to 64-bit totals and store them:
+// the v0 total goes to the address in x6, the v1 total to the address in x4.
+.macro ssimDist_end
+    uaddlv          d0, v0.4s
+    uaddlv          d1, v1.4s
+    str             d0, [x6]
+    str             d1, [x4]
+.endm
+
+.macro normFact_start                // clear the accumulator that normFact_end reduces
+    movi            v0.16b, #0
+.endm
+
+// Reduce the v0 accumulator (32-bit lanes) to a 64-bit total and store it
+// at the address in x3.
+.macro normFact_end
+    uaddlv          d0, v0.4s
+    str             d0, [x3]
+.endm
+

 
@@ -0,0 +1,84 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// This file contains the macros written using NEON instruction set
+// that are also used by the SVE2 functions
+
+.arch           armv8-a
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.macro pixel_var_start               // clear the four accumulators updated by pixel_var_1
+    movi            v0.16b, #0       // sum accumulator fed from bytes 0-7 of each input vector
+    movi            v1.16b, #0       // sum accumulator fed from bytes 8-15 of each input vector
+    movi            v2.16b, #0       // squares accumulator for bytes 0-7
+    movi            v3.16b, #0       // squares accumulator for bytes 8-15
+.endm
+
+.macro pixel_var_1 v                                 // fold one 16-byte vector into v0-v3; clobbers v30/v31
+    uaddw           v0.8h, v0.8h, \v\().8b           // v0 += bytes 0-7, widened to 16 bits
+    umull           v30.8h, \v\().8b, \v\().8b       // v30 = squares of bytes 0-7
+    uaddw2          v1.8h, v1.8h, \v\().16b          // v1 += bytes 8-15, widened to 16 bits
+    umull2          v31.8h, \v\().16b, \v\().16b     // v31 = squares of bytes 8-15
+    uadalp          v2.4s, v30.8h                    // v2 += pairwise sums of v30, as 32-bit lanes
+    uadalp          v3.4s, v31.8h                    // v3 += pairwise sums of v31, as 32-bit lanes
+.endm
+
+// Reduce the v0-v3 accumulators and pack the result into x0:
+// bits 0-31 hold the pixel sum, bits 32-63 hold the sum of squares. Clobbers x2.
+.macro pixel_var_end
+    uaddlv          s0, v0.8h                // s0 = total of the v0 lanes
+    uaddlv          s1, v1.8h                // s1 = total of the v1 lanes
+    add             v2.4s, v2.4s, v3.4s      // merge the two squares accumulators
+    // The totals are integers, so they are added with an integer instruction; a
+    // floating-point add would depend on subnormals being preserved (FPCR.FZ clear).
+    // uaddlv zeroed bits 32-63 of d0 and d1, so the 64-bit add leaves the sum in s0.
+    add             d0, d0, d1
+    uaddlv          d2, v2.4s                // d2 = total of squares
+    fmov            w0, s0
+    fmov            x2, d2
+    orr             x0, x0, x2, lsl #32      // x0 = sum | (squares << 32)
+.endm
+
+.macro ssimDist_start                // clear the two accumulators that ssimDist_end reduces
+    movi            v0.16b, #0
+    movi            v1.16b, #0
+.endm
+
+// Reduce the two accumulators (32-bit lanes) to 64-bit totals and store them:
+// the v0 total goes to the address in x6, the v1 total to the address in x4.
+.macro ssimDist_end
+    uaddlv          d0, v0.4s
+    uaddlv          d1, v1.4s
+    str             d0, [x6]
+    str             d1, [x4]
+.endm
+
+.macro normFact_start                // clear the accumulator that normFact_end reduces
+    movi            v0.16b, #0
+.endm
+
+// Reduce the v0 accumulator (32-bit lanes) to a 64-bit total and store it
+// at the address in x3.
+.macro normFact_end
+    uaddlv          d0, v0.4s
+    str             d0, [x3]
+.endm
+
​

x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve.S Added

@@ -0,0 +1,373 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+#include "pixel-util-common.S"
+
+.arch armv8-a+sve
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// Residual of an 8x16 block: each output row is (row at x2) - (row at x3), with the
+// 8-bit inputs zero-extended to 16 bits.
+// x0 = 16-bit destination, x1 = destination stride in 16-bit elements,
+// x2/x3 = 8-bit sources, x4/x5 = their strides in bytes.
+function PFX(pixel_sub_ps_8x16_sve)
+    lsl             x1, x1, #1               // destination stride: elements -> bytes
+    ptrue           p0.h, vl8                // eight active halfword lanes
+.rept 8                                      // two rows per repetition
+    ld1b            {z0.h}, p0/z, [x2]
+    ld1b            {z1.h}, p0/z, [x3]
+    add             x2, x2, x4
+    add             x3, x3, x5
+    ld1b            {z2.h}, p0/z, [x2]
+    ld1b            {z3.h}, p0/z, [x3]
+    add             x2, x2, x4
+    add             x3, x3, x5
+    sub             z4.h, z0.h, z1.h
+    sub             z5.h, z2.h, z3.h
+    st1             {v4.8h}, [x0], x1
+    st1             {v5.8h}, [x0], x1
+.endr
+    ret
+endfunc
+
+//******* satd *******
+// 4x4 SATD core shared by the pixel_satd_*_sve entry points.
+// In:  x0/x1 = first block and its stride, x2/x3 = second block and its stride,
+//      p0 = predicate prepared by the caller (ptrue p0.h, vl4 in this file).
+// Out: d0 = result. x0 and x2 are advanced by four rows; z0-z7 are clobbered.
+.macro satd_4x4_sve
+    // Load four rows from each block, zero-extending bytes to halfwords.
+    ld1b            {z0.h}, p0/z, [x0]
+    ld1b            {z2.h}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    ld1b            {z1.h}, p0/z, [x0]
+    ld1b            {z3.h}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    ld1b            {z4.h}, p0/z, [x0]
+    ld1b            {z6.h}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    ld1b            {z5.h}, p0/z, [x0]
+    ld1b            {z7.h}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+
+    // Row differences between the two blocks.
+    sub             z0.h, z0.h, z2.h
+    sub             z1.h, z1.h, z3.h
+    sub             z2.h, z4.h, z6.h
+    sub             z3.h, z5.h, z7.h
+
+    // Two sum/difference (butterfly) stages across the rows.
+    add             z4.h, z0.h, z2.h
+    add             z5.h, z1.h, z3.h
+    sub             z6.h, z0.h, z2.h
+    sub             z7.h, z1.h, z3.h
+
+    add             z0.h, z4.h, z5.h
+    sub             z1.h, z4.h, z5.h
+
+    add             z2.h, z6.h, z7.h
+    sub             z3.h, z6.h, z7.h
+
+    // Interleave halfword lanes, then another butterfly stage.
+    trn1            z4.h, z0.h, z2.h
+    trn2            z5.h, z0.h, z2.h
+
+    trn1            z6.h, z1.h, z3.h
+    trn2            z7.h, z1.h, z3.h
+
+    add             z0.h, z4.h, z5.h
+    sub             z1.h, z4.h, z5.h
+
+    add             z2.h, z6.h, z7.h
+    sub             z3.h, z6.h, z7.h
+
+    // Interleave word lanes.
+    trn1            z4.s, z0.s, z1.s
+    trn2            z5.s, z0.s, z1.s
+
+    trn1            z6.s, z2.s, z3.s
+    trn2            z7.s, z2.s, z3.s
+
+    // Absolute values, lane-wise maximum of each pair, then combine.
+    abs             z4.h, p0/m, z4.h
+    abs             z5.h, p0/m, z5.h
+    abs             z6.h, p0/m, z6.h
+    abs             z7.h, p0/m, z7.h
+
+    smax            z4.h, p0/m, z4.h, z5.h
+    smax            z6.h, p0/m, z6.h, z7.h
+
+    add             z0.h, z4.h, z6.h
+
+    // Pairwise widening adds collapse the four halfword lanes into d0.
+    uaddlp          v0.2s, v0.4h
+    uaddlp          v0.1d, v0.2s
+.endm
+
+// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function PFX(pixel_satd_4x4_sve)
+    ptrue           p0.h, vl4                // four active halfword lanes for the satd_4x4_sve macro
+    satd_4x4_sve
+    fmov            x0, d0                   // the macro leaves its result in d0
+    ret
+endfunc
+
+// SATD of an 8x4 block, computed as two 4x4 blocks placed side by side.
+// x0/x1 = first block and its stride, x2/x3 = second block and its stride.
+function PFX(pixel_satd_8x4_sve)
+    ptrue           p0.h, vl4
+    mov             x4, x0                   // keep the block origins; the macro advances x0/x2
+    mov             x5, x2
+    satd_4x4_sve
+    add             x0, x4, #4               // right-hand 4x4 block
+    add             x2, x5, #4
+    umov            x6, v0.d[0]              // partial result of the left-hand block
+    satd_4x4_sve
+    umov            x0, v0.d[0]
+    add             x0, x0, x6
+    ret
+endfunc
+
+// SATD of an 8x12 block, accumulated in x7 from six 4x4 blocks
+// (three bands of four rows, each band split into a left and a right block).
+// x0/x1 = first block and its stride, x2/x3 = second block and its stride.
+function PFX(pixel_satd_8x12_sve)
+    ptrue           p0.h, vl4
+    mov             x4, x0
+    mov             x5, x2
+    mov             x7, #0                   // running total
+    satd_4x4_sve
+    umov            x6, v0.d[0]
+    add             x7, x7, x6
+    add             x0, x4, #4               // right-hand block of the same band
+    add             x2, x5, #4
+    satd_4x4_sve
+    umov            x6, v0.d[0]
+    add             x7, x7, x6
+.rept 2
+    sub             x0, x0, #4               // back to the left column; the macro already moved down four rows
+    sub             x2, x2, #4
+    mov             x4, x0
+    mov             x5, x2
+    satd_4x4_sve
+    umov            x6, v0.d[0]
+    add             x7, x7, x6
+    add             x0, x4, #4
+    add             x2, x5, #4
+    satd_4x4_sve
+    umov            x6, v0.d[0]
+    add             x7, x7, x6
+.endr
+    mov             x0, x7
+    ret
+endfunc
+
+.macro LOAD_DIFF_16x4_sve v0 v1 v2 v3 v4 v5 v6 v7
+    mov             x11, #8 // in order to consider CPUs whose vector size is greater than 128 bits
+    ld1b            {z0.h}, p0/z, x0
+    ld1b            {z1.h}, p0/z, x0, x11
+    ld1b            {z2.h}, p0/z, x2
+    ld1b            {z3.h}, p0/z, x2, x11
+    add             x0, x0, x1
+    add             x2, x2, x3
+    ld1b            {z4.h}, p0/z, x0
+    ld1b            {z5.h}, p0/z, x0, x11
+    ld1b            {z6.h}, p0/z, x2
+    ld1b            {z7.h}, p0/z, x2, x11
+    add             x0, x0, x1
+    add             x2, x2, x3
+    ld1b            {z29.h}, p0/z, x0
+    ld1b            {z9.h}, p0/z, x0, x11
+    ld1b            {z10.h}, p0/z, x2
+    ld1b            {z11.h}, p0/z, x2, x11
+    add             x0, x0, x1
+    add             x2, x2, x3
+    ld1b            {z12.h}, p0/z, x0

 
@@ -0,0 +1,373 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+#include "pixel-util-common.S"
+
+.arch armv8-a+sve
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// Residual of an 8x16 block: each output row is (row at x2) - (row at x3), with the
+// 8-bit inputs zero-extended to 16 bits.
+// x0 = 16-bit destination, x1 = destination stride in 16-bit elements,
+// x2/x3 = 8-bit sources, x4/x5 = their strides in bytes.
+function PFX(pixel_sub_ps_8x16_sve)
+    lsl             x1, x1, #1               // destination stride: elements -> bytes
+    ptrue           p0.h, vl8                // eight active halfword lanes
+.rept 8                                      // two rows per repetition
+    ld1b            {z0.h}, p0/z, [x2]
+    ld1b            {z1.h}, p0/z, [x3]
+    add             x2, x2, x4
+    add             x3, x3, x5
+    ld1b            {z2.h}, p0/z, [x2]
+    ld1b            {z3.h}, p0/z, [x3]
+    add             x2, x2, x4
+    add             x3, x3, x5
+    sub             z4.h, z0.h, z1.h
+    sub             z5.h, z2.h, z3.h
+    st1             {v4.8h}, [x0], x1
+    st1             {v5.8h}, [x0], x1
+.endr
+    ret
+endfunc
+
+//******* satd *******
+// 4x4 SATD core shared by the pixel_satd_*_sve entry points.
+// In:  x0/x1 = first block and its stride, x2/x3 = second block and its stride,
+//      p0 = predicate prepared by the caller (ptrue p0.h, vl4 in this file).
+// Out: d0 = result. x0 and x2 are advanced by four rows; z0-z7 are clobbered.
+.macro satd_4x4_sve
+    // Load four rows from each block, zero-extending bytes to halfwords.
+    ld1b            {z0.h}, p0/z, [x0]
+    ld1b            {z2.h}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    ld1b            {z1.h}, p0/z, [x0]
+    ld1b            {z3.h}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    ld1b            {z4.h}, p0/z, [x0]
+    ld1b            {z6.h}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    ld1b            {z5.h}, p0/z, [x0]
+    ld1b            {z7.h}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+
+    // Row differences between the two blocks.
+    sub             z0.h, z0.h, z2.h
+    sub             z1.h, z1.h, z3.h
+    sub             z2.h, z4.h, z6.h
+    sub             z3.h, z5.h, z7.h
+
+    // Two sum/difference (butterfly) stages across the rows.
+    add             z4.h, z0.h, z2.h
+    add             z5.h, z1.h, z3.h
+    sub             z6.h, z0.h, z2.h
+    sub             z7.h, z1.h, z3.h
+
+    add             z0.h, z4.h, z5.h
+    sub             z1.h, z4.h, z5.h
+
+    add             z2.h, z6.h, z7.h
+    sub             z3.h, z6.h, z7.h
+
+    // Interleave halfword lanes, then another butterfly stage.
+    trn1            z4.h, z0.h, z2.h
+    trn2            z5.h, z0.h, z2.h
+
+    trn1            z6.h, z1.h, z3.h
+    trn2            z7.h, z1.h, z3.h
+
+    add             z0.h, z4.h, z5.h
+    sub             z1.h, z4.h, z5.h
+
+    add             z2.h, z6.h, z7.h
+    sub             z3.h, z6.h, z7.h
+
+    // Interleave word lanes.
+    trn1            z4.s, z0.s, z1.s
+    trn2            z5.s, z0.s, z1.s
+
+    trn1            z6.s, z2.s, z3.s
+    trn2            z7.s, z2.s, z3.s
+
+    // Absolute values, lane-wise maximum of each pair, then combine.
+    abs             z4.h, p0/m, z4.h
+    abs             z5.h, p0/m, z5.h
+    abs             z6.h, p0/m, z6.h
+    abs             z7.h, p0/m, z7.h
+
+    smax            z4.h, p0/m, z4.h, z5.h
+    smax            z6.h, p0/m, z6.h, z7.h
+
+    add             z0.h, z4.h, z6.h
+
+    // Pairwise widening adds collapse the four halfword lanes into d0.
+    uaddlp          v0.2s, v0.4h
+    uaddlp          v0.1d, v0.2s
+.endm
+
+// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function PFX(pixel_satd_4x4_sve)
+    ptrue           p0.h, vl4                // four active halfword lanes for the satd_4x4_sve macro
+    satd_4x4_sve
+    fmov            x0, d0                   // the macro leaves its result in d0
+    ret
+endfunc
+
+// SATD of an 8x4 block, computed as two 4x4 blocks placed side by side.
+// x0/x1 = first block and its stride, x2/x3 = second block and its stride.
+function PFX(pixel_satd_8x4_sve)
+    ptrue           p0.h, vl4
+    mov             x4, x0                   // keep the block origins; the macro advances x0/x2
+    mov             x5, x2
+    satd_4x4_sve
+    add             x0, x4, #4               // right-hand 4x4 block
+    add             x2, x5, #4
+    umov            x6, v0.d[0]              // partial result of the left-hand block
+    satd_4x4_sve
+    umov            x0, v0.d[0]
+    add             x0, x0, x6
+    ret
+endfunc
+
+// SATD of an 8x12 block, accumulated in x7 from six 4x4 blocks
+// (three bands of four rows, each band split into a left and a right block).
+// x0/x1 = first block and its stride, x2/x3 = second block and its stride.
+function PFX(pixel_satd_8x12_sve)
+    ptrue           p0.h, vl4
+    mov             x4, x0
+    mov             x5, x2
+    mov             x7, #0                   // running total
+    satd_4x4_sve
+    umov            x6, v0.d[0]
+    add             x7, x7, x6
+    add             x0, x4, #4               // right-hand block of the same band
+    add             x2, x5, #4
+    satd_4x4_sve
+    umov            x6, v0.d[0]
+    add             x7, x7, x6
+.rept 2
+    sub             x0, x0, #4               // back to the left column; the macro already moved down four rows
+    sub             x2, x2, #4
+    mov             x4, x0
+    mov             x5, x2
+    satd_4x4_sve
+    umov            x6, v0.d[0]
+    add             x7, x7, x6
+    add             x0, x4, #4
+    add             x2, x5, #4
+    satd_4x4_sve
+    umov            x6, v0.d[0]
+    add             x7, x7, x6
+.endr
+    mov             x0, x7
+    ret
+endfunc
+
+.macro LOAD_DIFF_16x4_sve v0 v1 v2 v3 v4 v5 v6 v7
+    mov             x11, #8 // in order to consider CPUs whose vector size is greater than 128 bits
+    ld1b            {z0.h}, p0/z, x0
+    ld1b            {z1.h}, p0/z, x0, x11
+    ld1b            {z2.h}, p0/z, x2
+    ld1b            {z3.h}, p0/z, x2, x11
+    add             x0, x0, x1
+    add             x2, x2, x3
+    ld1b            {z4.h}, p0/z, x0
+    ld1b            {z5.h}, p0/z, x0, x11
+    ld1b            {z6.h}, p0/z, x2
+    ld1b            {z7.h}, p0/z, x2, x11
+    add             x0, x0, x1
+    add             x2, x2, x3
+    ld1b            {z29.h}, p0/z, x0
+    ld1b            {z9.h}, p0/z, x0, x11
+    ld1b            {z10.h}, p0/z, x2
+    ld1b            {z11.h}, p0/z, x2, x11
+    add             x0, x0, x1
+    add             x2, x2, x3
+    ld1b            {z12.h}, p0/z, x0
​

x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve2.S Added

@@ -0,0 +1,1686 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+#include "pixel-util-common.S"
+
+.arch armv8-a+sve2
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
+// 8x8 block: returns the pixel sum in bits 0-31 and the sum of squares in bits 32-63.
+function PFX(pixel_var_8x8_sve2)
+    ptrue           p0.h, vl8
+    ld1b            {z0.h}, p0/z, [x0]       // the first row seeds the sum accumulator
+    add             x0, x0, x1
+    mul             z31.h, z0.h, z0.h
+    uaddlp          v1.4s, v31.8h            // ... and the squares accumulator
+.rept 7
+    ld1b            {z4.h}, p0/z, [x0]
+    add             x0, x0, x1
+    add             z0.h, z0.h, z4.h         // sum += row
+    mul             z31.h, z4.h, z4.h
+    uadalp          z1.s, p0/m, z31.h        // squares += row * row
+.endr
+    uaddlv          s0, v0.8h
+    uaddlv          d1, v1.4s
+    fmov            w0, s0
+    fmov            x1, d1
+    orr             x0, x0, x1, lsl #32      // x0 = sum | (squares << 32)
+    ret
+endfunc
+
+// pixel_var for a 16x16 block (x0 = pixels, x1 = stride in bytes).
+// Returns the pixel sum in bits 0-31 and the sum of squares in bits 32-63.
+function PFX(pixel_var_16x16_sve2)
+    rdvl            x9, #1                   // x9 = SVE vector length in bytes
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_var_16x16
+    // 128-bit vectors: NEON path built from the shared pixel_var_* macros.
+    pixel_var_start
+    mov             w12, #16
+.loop_var_16_sve2:
+    sub             w12, w12, #1
+    ld1             {v4.16b}, [x0], x1
+    pixel_var_1 v4
+    cbnz            w12, .loop_var_16_sve2
+    pixel_var_end
+    ret
+.vl_gt_16_pixel_var_16x16:
+    // Longer vectors: one row of 16 pixels per load, widened to halfwords.
+    ptrue           p0.h, vl16
+    mov             z0.d, #0                 // sum accumulator
+    mov             z1.d, #0                 // squares accumulator; uadalp only adds to it
+.rept 16
+    ld1b            {z4.h}, p0/z, [x0]
+    add             x0, x0, x1
+    add             z0.h, z0.h, z4.h
+    mul             z30.h, z4.h, z4.h
+    uadalp          z1.s, p0/m, z30.h
+.endr
+    uaddv           d0, p0, z0.h
+    uaddv           d1, p0, z1.s
+    fmov            w0, s0
+    fmov            x1, d1
+    orr             x0, x0, x1, lsl #32      // x0 = sum | (squares << 32)
+    ret
+endfunc
+
+// pixel_var for a 32x32 block (x0 = pixels, x1 = stride in bytes).
+// Returns the pixel sum in bits 0-31 and the sum of squares in bits 32-63.
+function PFX(pixel_var_32x32_sve2)
+    rdvl            x9, #1                   // x9 = SVE vector length in bytes
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_var_32x32
+    // 128-bit vectors: NEON path built from the shared pixel_var_* macros.
+    pixel_var_start
+    mov             w12, #32
+.loop_var_32_sve2:
+    sub             w12, w12, #1
+    ld1             {v4.16b-v5.16b}, [x0], x1
+    pixel_var_1 v4
+    pixel_var_1 v5
+    cbnz            w12, .loop_var_32_sve2
+    pixel_var_end
+    ret
+.vl_gt_16_pixel_var_32x32:
+    cmp             x9, #48
+    bgt             .vl_gt_48_pixel_var_32x32
+    // Vector length above 16 and up to 48 bytes: one 32-byte load per row,
+    // widened through the bottom/top (even/odd byte) instructions.
+    ptrue           p0.b, vl32
+    mov             z0.d, #0
+    mov             z1.d, #0
+.rept 32
+    ld1b            {z4.b}, p0/z, [x0]
+    add             x0, x0, x1
+    uaddwb          z0.h, z0.h, z4.b
+    uaddwt          z0.h, z0.h, z4.b
+    umullb          z28.h, z4.b, z4.b
+    umullt          z29.h, z4.b, z4.b
+    uadalp          z1.s, p0/m, z28.h
+    uadalp          z1.s, p0/m, z29.h
+.endr
+    uaddv           d0, p0, z0.h
+    uaddv           d1, p0, z1.s
+    fmov            w0, s0
+    fmov            x1, d1
+    orr             x0, x0, x1, lsl #32
+    ret
+.vl_gt_48_pixel_var_32x32:
+    // Vector length above 48 bytes: a whole row fits as 32 halfword lanes.
+    ptrue           p0.h, vl32
+    mov             z0.d, #0
+    mov             z1.d, #0
+.rept 32
+    ld1b            {z4.h}, p0/z, [x0]
+    add             x0, x0, x1
+    add             z0.h, z0.h, z4.h
+    mul             z28.h, z4.h, z4.h
+    uadalp          z1.s, p0/m, z28.h
+.endr
+    uaddv           d0, p0, z0.h
+    uaddv           d1, p0, z1.s
+    fmov            w0, s0
+    fmov            x1, d1
+    orr             x0, x0, x1, lsl #32
+    ret
+endfunc
+
+function PFX(pixel_var_64x64_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_var_64x64
+    pixel_var_start
+    mov             w12, #64
+.loop_var_64_sve2:
+    sub             w12, w12, #1
+    ld1             {v4.16b-v7.16b}, x0, x1
+    pixel_var_1 v4
+    pixel_var_1 v5
+    pixel_var_1 v6
+    pixel_var_1 v7
+    cbnz            w12, .loop_var_64_sve2
+    pixel_var_end
+    ret
+.vl_gt_16_pixel_var_64x64:
+    cmp             x9, #48
+    bgt             .vl_gt_48_pixel_var_64x64
+    ptrue           p0.b, vl32
+    mov             z0.d, #0
+    mov             z2.d, #0
+.rept 64
+    ld1b            {z4.b}, p0/z, x0
+    ld1b            {z5.b}, p0/z, x0, #1, mul vl
+    add             x0, x0, x1
+    uaddwb          z0.h, z0.h, z4.b
+    uaddwt          z0.h, z0.h, z4.b
+    uaddwb          z0.h, z0.h, z5.b
+    uaddwt          z0.h, z0.h, z5.b
+    umullb          z24.h, z4.b, z4.b
+    umullt          z25.h, z4.b, z4.b
+    umullb          z26.h, z5.b, z5.b
+    umullt          z27.h, z5.b, z5.b
+    uadalp          z2.s, p0/m, z24.h
+    uadalp          z2.s, p0/m, z25.h
+    uadalp          z2.s, p0/m, z26.h
+    uadalp          z2.s, p0/m, z27.h
+.endr
+    uaddv           d0, p0, z0.h
+    uaddv           d1, p0, z2.s
+    fmov            w0, s0
+    fmov            x1, d1
+    orr             x0, x0, x1, lsl #32
+    ret
+.vl_gt_48_pixel_var_64x64:
+    cmp             x9, #112
+    bgt             .vl_gt_112_pixel_var_64x64
+    ptrue           p0.b, vl64
+    mov             z0.d, #0
+    mov             z1.d, #0
+.rept 64
+    ld1b            {z4.b}, p0/z, x0

 
@@ -0,0 +1,1686 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+#include "pixel-util-common.S"
+
+.arch armv8-a+sve2
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
+// 8x8 block: returns the pixel sum in bits 0-31 and the sum of squares in bits 32-63.
+function PFX(pixel_var_8x8_sve2)
+    ptrue           p0.h, vl8
+    ld1b            {z0.h}, p0/z, [x0]       // the first row seeds the sum accumulator
+    add             x0, x0, x1
+    mul             z31.h, z0.h, z0.h
+    uaddlp          v1.4s, v31.8h            // ... and the squares accumulator
+.rept 7
+    ld1b            {z4.h}, p0/z, [x0]
+    add             x0, x0, x1
+    add             z0.h, z0.h, z4.h         // sum += row
+    mul             z31.h, z4.h, z4.h
+    uadalp          z1.s, p0/m, z31.h        // squares += row * row
+.endr
+    uaddlv          s0, v0.8h
+    uaddlv          d1, v1.4s
+    fmov            w0, s0
+    fmov            x1, d1
+    orr             x0, x0, x1, lsl #32      // x0 = sum | (squares << 32)
+    ret
+endfunc
+
+// pixel_var for a 16x16 block (x0 = pixels, x1 = stride in bytes).
+// Returns the pixel sum in bits 0-31 and the sum of squares in bits 32-63.
+function PFX(pixel_var_16x16_sve2)
+    rdvl            x9, #1                   // x9 = SVE vector length in bytes
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_var_16x16
+    // 128-bit vectors: NEON path built from the shared pixel_var_* macros.
+    pixel_var_start
+    mov             w12, #16
+.loop_var_16_sve2:
+    sub             w12, w12, #1
+    ld1             {v4.16b}, [x0], x1
+    pixel_var_1 v4
+    cbnz            w12, .loop_var_16_sve2
+    pixel_var_end
+    ret
+.vl_gt_16_pixel_var_16x16:
+    // Longer vectors: one row of 16 pixels per load, widened to halfwords.
+    ptrue           p0.h, vl16
+    mov             z0.d, #0                 // sum accumulator
+    mov             z1.d, #0                 // squares accumulator; uadalp only adds to it
+.rept 16
+    ld1b            {z4.h}, p0/z, [x0]
+    add             x0, x0, x1
+    add             z0.h, z0.h, z4.h
+    mul             z30.h, z4.h, z4.h
+    uadalp          z1.s, p0/m, z30.h
+.endr
+    uaddv           d0, p0, z0.h
+    uaddv           d1, p0, z1.s
+    fmov            w0, s0
+    fmov            x1, d1
+    orr             x0, x0, x1, lsl #32      // x0 = sum | (squares << 32)
+    ret
+endfunc
+
+// pixel_var for a 32x32 block (x0 = pixels, x1 = stride in bytes).
+// Returns the pixel sum in bits 0-31 and the sum of squares in bits 32-63.
+function PFX(pixel_var_32x32_sve2)
+    rdvl            x9, #1                   // x9 = SVE vector length in bytes
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_var_32x32
+    // 128-bit vectors: NEON path built from the shared pixel_var_* macros.
+    pixel_var_start
+    mov             w12, #32
+.loop_var_32_sve2:
+    sub             w12, w12, #1
+    ld1             {v4.16b-v5.16b}, [x0], x1
+    pixel_var_1 v4
+    pixel_var_1 v5
+    cbnz            w12, .loop_var_32_sve2
+    pixel_var_end
+    ret
+.vl_gt_16_pixel_var_32x32:
+    cmp             x9, #48
+    bgt             .vl_gt_48_pixel_var_32x32
+    // Vector length above 16 and up to 48 bytes: one 32-byte load per row,
+    // widened through the bottom/top (even/odd byte) instructions.
+    ptrue           p0.b, vl32
+    mov             z0.d, #0
+    mov             z1.d, #0
+.rept 32
+    ld1b            {z4.b}, p0/z, [x0]
+    add             x0, x0, x1
+    uaddwb          z0.h, z0.h, z4.b
+    uaddwt          z0.h, z0.h, z4.b
+    umullb          z28.h, z4.b, z4.b
+    umullt          z29.h, z4.b, z4.b
+    uadalp          z1.s, p0/m, z28.h
+    uadalp          z1.s, p0/m, z29.h
+.endr
+    uaddv           d0, p0, z0.h
+    uaddv           d1, p0, z1.s
+    fmov            w0, s0
+    fmov            x1, d1
+    orr             x0, x0, x1, lsl #32
+    ret
+.vl_gt_48_pixel_var_32x32:
+    // Vector length above 48 bytes: a whole row fits as 32 halfword lanes.
+    ptrue           p0.h, vl32
+    mov             z0.d, #0
+    mov             z1.d, #0
+.rept 32
+    ld1b            {z4.h}, p0/z, [x0]
+    add             x0, x0, x1
+    add             z0.h, z0.h, z4.h
+    mul             z28.h, z4.h, z4.h
+    uadalp          z1.s, p0/m, z28.h
+.endr
+    uaddv           d0, p0, z0.h
+    uaddv           d1, p0, z1.s
+    fmov            w0, s0
+    fmov            x1, d1
+    orr             x0, x0, x1, lsl #32
+    ret
+endfunc
+
+function PFX(pixel_var_64x64_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_var_64x64
+    pixel_var_start
+    mov             w12, #64
+.loop_var_64_sve2:
+    sub             w12, w12, #1
+    ld1             {v4.16b-v7.16b}, x0, x1
+    pixel_var_1 v4
+    pixel_var_1 v5
+    pixel_var_1 v6
+    pixel_var_1 v7
+    cbnz            w12, .loop_var_64_sve2
+    pixel_var_end
+    ret
+.vl_gt_16_pixel_var_64x64:
+    cmp             x9, #48
+    bgt             .vl_gt_48_pixel_var_64x64
+    ptrue           p0.b, vl32
+    mov             z0.d, #0
+    mov             z2.d, #0
+.rept 64
+    ld1b            {z4.b}, p0/z, x0
+    ld1b            {z5.b}, p0/z, x0, #1, mul vl
+    add             x0, x0, x1
+    uaddwb          z0.h, z0.h, z4.b
+    uaddwt          z0.h, z0.h, z4.b
+    uaddwb          z0.h, z0.h, z5.b
+    uaddwt          z0.h, z0.h, z5.b
+    umullb          z24.h, z4.b, z4.b
+    umullt          z25.h, z4.b, z4.b
+    umullb          z26.h, z5.b, z5.b
+    umullt          z27.h, z5.b, z5.b
+    uadalp          z2.s, p0/m, z24.h
+    uadalp          z2.s, p0/m, z25.h
+    uadalp          z2.s, p0/m, z26.h
+    uadalp          z2.s, p0/m, z27.h
+.endr
+    uaddv           d0, p0, z0.h
+    uaddv           d1, p0, z2.s
+    fmov            w0, s0
+    fmov            x1, d1
+    orr             x0, x0, x1, lsl #32
+    ret
+.vl_gt_48_pixel_var_64x64:
+    cmp             x9, #112
+    bgt             .vl_gt_112_pixel_var_64x64
+    ptrue           p0.b, vl64
+    mov             z0.d, #0
+    mov             z1.d, #0
+.rept 64
+    ld1b            {z4.b}, p0/z, x0
​

x265_3.5.tar.gz/source/common/aarch64/pixel-util.S -> x265_3.6.tar.gz/source/common/aarch64/pixel-util.S Changed

@@ -1,8 +1,9 @@
 /*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
  *
  * Authors: Yimeng Su <yimeng.su@huawei.com>
  *          Hongbin Liu <liuhongbin1@huawei.com>
+ *          Sebastian Pop <spop@amazon.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -23,13 +24,652 @@
  *****************************************************************************/
 
 #include "asm.S"
+#include "pixel-util-common.S"
 
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
 .section .rodata
+#endif
 
 .align 4
 
 .text
 
+// uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
+// 8x8 variant: x0 = pix, x1 = i_stride (added to the pointer after every row).
+// Returns the pixel sum in bits 0-31 and the sum of squared pixels in bits 32-63.
+function PFX(pixel_var_8x8_neon)
+    ld1             {v4.8b}, [x0], x1        // pixx (row 0), then pix += i_stride
+    uxtl            v0.8h, v4.8b             // sum = pixx
+    umull           v1.8h, v4.8b, v4.8b
+    uaddlp          v1.4s, v1.8h             // sqr = pixx * pixx
+
+.rept 7
+    ld1             {v4.8b}, [x0], x1        // pixx
+    umull           v31.8h, v4.8b, v4.8b
+    uaddw           v0.8h, v0.8h, v4.8b      // sum += pixx
+    uadalp          v1.4s, v31.8h            // sqr += pixx * pixx
+.endr
+    uaddlv          s0, v0.8h                // fold the eight 16-bit column sums
+    uaddlv          d1, v1.4s                // fold the four 32-bit square sums
+    fmov            w0, s0
+    fmov            x1, d1
+    orr             x0, x0, x1, lsl #32      // return sum + ((uint64_t)sqr << 32);
+    ret
+endfunc
+
+// 16x16 variant of pixel_var: x0 = pix, x1 = i_stride, one 16-byte row per
+// loop iteration. pixel_var_start, pixel_var_1 and pixel_var_end are macros
+// defined outside this section; w12 counts the remaining rows.
+function PFX(pixel_var_16x16_neon)
+    pixel_var_start
+    mov             w12, #16
+.loop_var_16:
+    sub             w12, w12, #1
+    ld1             {v4.16b}, [x0], x1       // load a row, then pix += i_stride
+    pixel_var_1 v4
+    cbnz            w12, .loop_var_16
+    pixel_var_end
+    ret
+endfunc
+
+// 32x32 variant of pixel_var: x0 = pix, x1 = i_stride, one 32-byte row (two
+// vectors) per loop iteration. pixel_var_start, pixel_var_1 and pixel_var_end
+// are macros defined outside this section; w12 counts the remaining rows.
+function PFX(pixel_var_32x32_neon)
+    pixel_var_start
+    mov             w12, #32
+.loop_var_32:
+    sub             w12, w12, #1
+    ld1             {v4.16b-v5.16b}, [x0], x1    // load a row, then pix += i_stride
+    pixel_var_1 v4
+    pixel_var_1 v5
+    cbnz            w12, .loop_var_32
+    pixel_var_end
+    ret
+endfunc
+
+// 64x64 variant of pixel_var: x0 = pix, x1 = i_stride, one 64-byte row (four
+// vectors) per loop iteration. pixel_var_start, pixel_var_1 and pixel_var_end
+// are macros defined outside this section; w12 counts the remaining rows.
+function PFX(pixel_var_64x64_neon)
+    pixel_var_start
+    mov             w12, #64
+.loop_var_64:
+    sub             w12, w12, #1
+    ld1             {v4.16b-v7.16b}, [x0], x1    // load a row, then pix += i_stride
+    pixel_var_1 v4
+    pixel_var_1 v5
+    pixel_var_1 v6
+    pixel_var_1 v7
+    cbnz            w12, .loop_var_64
+    pixel_var_end
+    ret
+endfunc
+
+// void getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
+// x0 = fenc, x1 = pred, x2 = residual, x3 = stride (the same stride is used
+// for all three buffers). Writes residual = fenc - pred for a 4x4 block,
+// two rows per .rept iteration.
+function PFX(getResidual4_neon)
+    lsl             x4, x3, #1               // residual row pitch in bytes (2 bytes per int16_t)
+.rept 2
+    // NOTE: each load fetches 8 bytes although only 4 results per row are stored.
+    ld1             {v0.8b}, [x0], x3
+    ld1             {v1.8b}, [x1], x3
+    ld1             {v2.8b}, [x0], x3
+    ld1             {v3.8b}, [x1], x3
+    usubl           v4.8h, v0.8b, v1.8b
+    usubl           v5.8h, v2.8b, v3.8b
+    // Store with a 16-bit arrangement so the element size matches the
+    // int16_t data (same bytes as .8b on little-endian targets).
+    st1             {v4.4h}, [x2], x4
+    st1             {v5.4h}, [x2], x4
+.endr
+    ret
+endfunc
+
+// 8x8 variant of getResidual: x0 = fenc, x1 = pred, x2 = residual, x3 = stride.
+// Writes residual = fenc - pred, two rows per .rept iteration.
+function PFX(getResidual8_neon)
+    lsl             x4, x3, #1               // residual row pitch in bytes (2 bytes per int16_t)
+.rept 4
+    ld1             {v0.8b}, [x0], x3
+    ld1             {v1.8b}, [x1], x3
+    ld1             {v2.8b}, [x0], x3
+    ld1             {v3.8b}, [x1], x3
+    usubl           v4.8h, v0.8b, v1.8b
+    usubl           v5.8h, v2.8b, v3.8b
+    // Store with a 16-bit arrangement so the element size matches the
+    // int16_t data (same bytes as .16b on little-endian targets).
+    st1             {v4.8h}, [x2], x4
+    st1             {v5.8h}, [x2], x4
+.endr
+    ret
+endfunc
+
+// 16x16 variant of getResidual: x0 = fenc, x1 = pred, x2 = residual, x3 = stride.
+// Writes residual = fenc - pred, two rows per .rept iteration.
+function PFX(getResidual16_neon)
+    lsl             x4, x3, #1               // residual row pitch in bytes (2 bytes per int16_t)
+.rept 8
+    ld1             {v0.16b}, [x0], x3
+    ld1             {v1.16b}, [x1], x3
+    ld1             {v2.16b}, [x0], x3
+    ld1             {v3.16b}, [x1], x3
+    usubl           v4.8h, v0.8b, v1.8b      // low 8 pixels of the first row
+    usubl2          v5.8h, v0.16b, v1.16b    // high 8 pixels of the first row
+    usubl           v6.8h, v2.8b, v3.8b
+    usubl2          v7.8h, v2.16b, v3.16b
+    st1             {v4.8h-v5.8h}, [x2], x4
+    st1             {v6.8h-v7.8h}, [x2], x4
+.endr
+    ret
+endfunc
+
+// 32x32 variant of getResidual: x0 = fenc, x1 = pred, x2 = residual, x3 = stride.
+// Writes residual = fenc - pred. The loop runs 4 times over a .rept of 4,
+// each repetition handling two rows (4 * 4 * 2 = 32 rows).
+function PFX(getResidual32_neon)
+    lsl             x4, x3, #1               // residual row pitch in bytes (2 bytes per int16_t)
+    mov             w12, #4
+.loop_residual_32:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v0.16b-v1.16b}, [x0], x3
+    ld1             {v2.16b-v3.16b}, [x1], x3
+    ld1             {v4.16b-v5.16b}, [x0], x3
+    ld1             {v6.16b-v7.16b}, [x1], x3
+    usubl           v16.8h, v0.8b, v2.8b
+    usubl2          v17.8h, v0.16b, v2.16b
+    usubl           v18.8h, v1.8b, v3.8b
+    usubl2          v19.8h, v1.16b, v3.16b
+    usubl           v20.8h, v4.8b, v6.8b
+    usubl2          v21.8h, v4.16b, v6.16b
+    usubl           v22.8h, v5.8b, v7.8b
+    usubl2          v23.8h, v5.16b, v7.16b
+    st1             {v16.8h-v19.8h}, [x2], x4
+    st1             {v20.8h-v23.8h}, [x2], x4
+.endr
+    cbnz            w12, .loop_residual_32
+    ret
+endfunc
+
+// void pixel_sub_ps_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
+// x0 = a, x1 = dstride, x2 = b0, x3 = b1, x4 = sstride0, x5 = sstride1.
+// 4x4 variant: writes a = b0 - b1 as 16-bit values, two rows per .rept iteration.
+function PFX(pixel_sub_ps_4x4_neon)
+    lsl             x1, x1, #1               // destination row pitch in bytes (2 bytes per int16_t)
+.rept 2
+    // NOTE: each load fetches 8 bytes although only 4 results per row are stored.
+    ld1             {v0.8b}, [x2], x4
+    ld1             {v1.8b}, [x3], x5
+    ld1             {v2.8b}, [x2], x4
+    ld1             {v3.8b}, [x3], x5
+    usubl           v4.8h, v0.8b, v1.8b
+    usubl           v5.8h, v2.8b, v3.8b
+    st1             {v4.4h}, [x0], x1
+    st1             {v5.4h}, [x0], x1
+.endr
+    ret
+endfunc
+
+// 8x8 variant of pixel_sub_ps: x0 = a, x1 = dstride, x2 = b0, x3 = b1,
+// x4 = sstride0, x5 = sstride1. Writes a = b0 - b1 as 16-bit values,
+// two rows per .rept iteration.
+function PFX(pixel_sub_ps_8x8_neon)
+    lsl             x1, x1, #1               // destination row pitch in bytes (2 bytes per int16_t)
+.rept 4
+    ld1             {v0.8b}, [x2], x4
+    ld1             {v1.8b}, [x3], x5
+    ld1             {v2.8b}, [x2], x4
+    ld1             {v3.8b}, [x3], x5
+    usubl           v4.8h, v0.8b, v1.8b
+    usubl           v5.8h, v2.8b, v3.8b
+    st1             {v4.8h}, [x0], x1
+    st1             {v5.8h}, [x0], x1
+.endr
+    ret
+endfunc
+
+function PFX(pixel_sub_ps_16x16_neon)
+    lsl             x1, x1, #1
+.rept 8
+    ld1             {v0.16b}, x2, x4
+    ld1             {v1.16b}, x3, x5
+    ld1             {v2.16b}, x2, x4
+    ld1             {v3.16b}, x3, x5
+    usubl           v4.8h, v0.8b, v1.8b

 
@@ -1,8 +1,9 @@
 /*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
  *
  * Authors: Yimeng Su <yimeng.su@huawei.com>
  *          Hongbin Liu <liuhongbin1@huawei.com>
+ *          Sebastian Pop <spop@amazon.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -23,13 +24,652 @@
  *****************************************************************************/
 
 #include "asm.S"
+#include "pixel-util-common.S"
 
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
 .section .rodata
+#endif
 
 .align 4
 
 .text
 
+// uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
+// 8x8 variant: x0 = pix, x1 = i_stride (added to the pointer after every row).
+// Returns the pixel sum in bits 0-31 and the sum of squared pixels in bits 32-63.
+function PFX(pixel_var_8x8_neon)
+    ld1             {v4.8b}, [x0], x1        // pixx (row 0), then pix += i_stride
+    uxtl            v0.8h, v4.8b             // sum = pixx
+    umull           v1.8h, v4.8b, v4.8b
+    uaddlp          v1.4s, v1.8h             // sqr = pixx * pixx
+
+.rept 7
+    ld1             {v4.8b}, [x0], x1        // pixx
+    umull           v31.8h, v4.8b, v4.8b
+    uaddw           v0.8h, v0.8h, v4.8b      // sum += pixx
+    uadalp          v1.4s, v31.8h            // sqr += pixx * pixx
+.endr
+    uaddlv          s0, v0.8h                // fold the eight 16-bit column sums
+    uaddlv          d1, v1.4s                // fold the four 32-bit square sums
+    fmov            w0, s0
+    fmov            x1, d1
+    orr             x0, x0, x1, lsl #32      // return sum + ((uint64_t)sqr << 32);
+    ret
+endfunc
+
+// 16x16 variant of pixel_var: x0 = pix, x1 = i_stride, one 16-byte row per
+// loop iteration. pixel_var_start, pixel_var_1 and pixel_var_end are macros
+// defined outside this section; w12 counts the remaining rows.
+function PFX(pixel_var_16x16_neon)
+    pixel_var_start
+    mov             w12, #16
+.loop_var_16:
+    sub             w12, w12, #1
+    ld1             {v4.16b}, [x0], x1       // load a row, then pix += i_stride
+    pixel_var_1 v4
+    cbnz            w12, .loop_var_16
+    pixel_var_end
+    ret
+endfunc
+
+// 32x32 variant of pixel_var: x0 = pix, x1 = i_stride, one 32-byte row (two
+// vectors) per loop iteration. pixel_var_start, pixel_var_1 and pixel_var_end
+// are macros defined outside this section; w12 counts the remaining rows.
+function PFX(pixel_var_32x32_neon)
+    pixel_var_start
+    mov             w12, #32
+.loop_var_32:
+    sub             w12, w12, #1
+    ld1             {v4.16b-v5.16b}, [x0], x1    // load a row, then pix += i_stride
+    pixel_var_1 v4
+    pixel_var_1 v5
+    cbnz            w12, .loop_var_32
+    pixel_var_end
+    ret
+endfunc
+
+// 64x64 variant of pixel_var: x0 = pix, x1 = i_stride, one 64-byte row (four
+// vectors) per loop iteration. pixel_var_start, pixel_var_1 and pixel_var_end
+// are macros defined outside this section; w12 counts the remaining rows.
+function PFX(pixel_var_64x64_neon)
+    pixel_var_start
+    mov             w12, #64
+.loop_var_64:
+    sub             w12, w12, #1
+    ld1             {v4.16b-v7.16b}, [x0], x1    // load a row, then pix += i_stride
+    pixel_var_1 v4
+    pixel_var_1 v5
+    pixel_var_1 v6
+    pixel_var_1 v7
+    cbnz            w12, .loop_var_64
+    pixel_var_end
+    ret
+endfunc
+
+// void getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
+// x0 = fenc, x1 = pred, x2 = residual, x3 = stride (the same stride is used
+// for all three buffers). Writes residual = fenc - pred for a 4x4 block,
+// two rows per .rept iteration.
+function PFX(getResidual4_neon)
+    lsl             x4, x3, #1               // residual row pitch in bytes (2 bytes per int16_t)
+.rept 2
+    // NOTE: each load fetches 8 bytes although only 4 results per row are stored.
+    ld1             {v0.8b}, [x0], x3
+    ld1             {v1.8b}, [x1], x3
+    ld1             {v2.8b}, [x0], x3
+    ld1             {v3.8b}, [x1], x3
+    usubl           v4.8h, v0.8b, v1.8b
+    usubl           v5.8h, v2.8b, v3.8b
+    // Store with a 16-bit arrangement so the element size matches the
+    // int16_t data (same bytes as .8b on little-endian targets).
+    st1             {v4.4h}, [x2], x4
+    st1             {v5.4h}, [x2], x4
+.endr
+    ret
+endfunc
+
+// 8x8 variant of getResidual: x0 = fenc, x1 = pred, x2 = residual, x3 = stride.
+// Writes residual = fenc - pred, two rows per .rept iteration.
+function PFX(getResidual8_neon)
+    lsl             x4, x3, #1               // residual row pitch in bytes (2 bytes per int16_t)
+.rept 4
+    ld1             {v0.8b}, [x0], x3
+    ld1             {v1.8b}, [x1], x3
+    ld1             {v2.8b}, [x0], x3
+    ld1             {v3.8b}, [x1], x3
+    usubl           v4.8h, v0.8b, v1.8b
+    usubl           v5.8h, v2.8b, v3.8b
+    // Store with a 16-bit arrangement so the element size matches the
+    // int16_t data (same bytes as .16b on little-endian targets).
+    st1             {v4.8h}, [x2], x4
+    st1             {v5.8h}, [x2], x4
+.endr
+    ret
+endfunc
+
+// 16x16 variant of getResidual: x0 = fenc, x1 = pred, x2 = residual, x3 = stride.
+// Writes residual = fenc - pred, two rows per .rept iteration.
+function PFX(getResidual16_neon)
+    lsl             x4, x3, #1               // residual row pitch in bytes (2 bytes per int16_t)
+.rept 8
+    ld1             {v0.16b}, [x0], x3
+    ld1             {v1.16b}, [x1], x3
+    ld1             {v2.16b}, [x0], x3
+    ld1             {v3.16b}, [x1], x3
+    usubl           v4.8h, v0.8b, v1.8b      // low 8 pixels of the first row
+    usubl2          v5.8h, v0.16b, v1.16b    // high 8 pixels of the first row
+    usubl           v6.8h, v2.8b, v3.8b
+    usubl2          v7.8h, v2.16b, v3.16b
+    st1             {v4.8h-v5.8h}, [x2], x4
+    st1             {v6.8h-v7.8h}, [x2], x4
+.endr
+    ret
+endfunc
+
+// 32x32 variant of getResidual: x0 = fenc, x1 = pred, x2 = residual, x3 = stride.
+// Writes residual = fenc - pred. The loop runs 4 times over a .rept of 4,
+// each repetition handling two rows (4 * 4 * 2 = 32 rows).
+function PFX(getResidual32_neon)
+    lsl             x4, x3, #1               // residual row pitch in bytes (2 bytes per int16_t)
+    mov             w12, #4
+.loop_residual_32:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v0.16b-v1.16b}, [x0], x3
+    ld1             {v2.16b-v3.16b}, [x1], x3
+    ld1             {v4.16b-v5.16b}, [x0], x3
+    ld1             {v6.16b-v7.16b}, [x1], x3
+    usubl           v16.8h, v0.8b, v2.8b
+    usubl2          v17.8h, v0.16b, v2.16b
+    usubl           v18.8h, v1.8b, v3.8b
+    usubl2          v19.8h, v1.16b, v3.16b
+    usubl           v20.8h, v4.8b, v6.8b
+    usubl2          v21.8h, v4.16b, v6.16b
+    usubl           v22.8h, v5.8b, v7.8b
+    usubl2          v23.8h, v5.16b, v7.16b
+    st1             {v16.8h-v19.8h}, [x2], x4
+    st1             {v20.8h-v23.8h}, [x2], x4
+.endr
+    cbnz            w12, .loop_residual_32
+    ret
+endfunc
+
+// void pixel_sub_ps_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
+// x0 = a, x1 = dstride, x2 = b0, x3 = b1, x4 = sstride0, x5 = sstride1.
+// 4x4 variant: writes a = b0 - b1 as 16-bit values, two rows per .rept iteration.
+function PFX(pixel_sub_ps_4x4_neon)
+    lsl             x1, x1, #1               // destination row pitch in bytes (2 bytes per int16_t)
+.rept 2
+    // NOTE: each load fetches 8 bytes although only 4 results per row are stored.
+    ld1             {v0.8b}, [x2], x4
+    ld1             {v1.8b}, [x3], x5
+    ld1             {v2.8b}, [x2], x4
+    ld1             {v3.8b}, [x3], x5
+    usubl           v4.8h, v0.8b, v1.8b
+    usubl           v5.8h, v2.8b, v3.8b
+    st1             {v4.4h}, [x0], x1
+    st1             {v5.4h}, [x0], x1
+.endr
+    ret
+endfunc
+
+// 8x8 variant of pixel_sub_ps: x0 = a, x1 = dstride, x2 = b0, x3 = b1,
+// x4 = sstride0, x5 = sstride1. Writes a = b0 - b1 as 16-bit values,
+// two rows per .rept iteration.
+function PFX(pixel_sub_ps_8x8_neon)
+    lsl             x1, x1, #1               // destination row pitch in bytes (2 bytes per int16_t)
+.rept 4
+    ld1             {v0.8b}, [x2], x4
+    ld1             {v1.8b}, [x3], x5
+    ld1             {v2.8b}, [x2], x4
+    ld1             {v3.8b}, [x3], x5
+    usubl           v4.8h, v0.8b, v1.8b
+    usubl           v5.8h, v2.8b, v3.8b
+    st1             {v4.8h}, [x0], x1
+    st1             {v5.8h}, [x0], x1
+.endr
+    ret
+endfunc
+
+function PFX(pixel_sub_ps_16x16_neon)
+    lsl             x1, x1, #1
+.rept 8
+    ld1             {v0.16b}, x2, x4
+    ld1             {v1.16b}, x3, x5
+    ld1             {v2.16b}, x2, x4
+    ld1             {v3.16b}, x3, x5
+    usubl           v4.8h, v0.8b, v1.8b
​

x265_3.6.tar.gz/source/common/aarch64/sad-a-common.S Added

@@ -0,0 +1,514 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// This file contains the macros written using NEON instruction set
+// that are also used by the SVE2 functions
+
+#include "asm.S"
+
+.arch           armv8-a
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+// Load two 4-byte rows from each source (x0/x1 and x2/x3 are pointer/stride
+// pairs) into lanes 0 and 1 of v0/v1, then apply \f to them into v16.8h.
+// \f is the instruction mnemonic: uabdl starts the accumulator, uabal adds to it.
+.macro SAD_START_4 f
+    ld1             {v0.s}[0], [x0], x1
+    ld1             {v0.s}[1], [x0], x1
+    ld1             {v1.s}[0], [x2], x3
+    ld1             {v1.s}[1], [x2], x3
+    \f              v16.8h, v0.8b, v1.8b
+.endm
+
+.macro SAD_4 h                               // accumulate further row pairs of a 4-wide block of height \h
+.rept \h / 2 - 1                             // one SAD_START_4 covers two rows; one pair is left to a separate SAD_START_4
+    SAD_START_4 uabal
+.endr
+.endm
+
+// Load two 8-byte rows from each source (x0/x1 and x2/x3 are pointer/stride
+// pairs) and apply \f to each row pair: the first row goes to v16.8h, the
+// second to v17.8h. \f is the instruction mnemonic (uabdl to start, uabal to add).
+.macro SAD_START_8 f
+    ld1             {v0.8b}, [x0], x1
+    ld1             {v1.8b}, [x2], x3
+    ld1             {v2.8b}, [x0], x1
+    ld1             {v3.8b}, [x2], x3
+    \f              v16.8h, v0.8b, v1.8b
+    \f              v17.8h, v2.8b, v3.8b
+.endm
+
+.macro SAD_8 h                               // accumulate further row pairs of an 8-wide block of height \h
+.rept \h / 2 - 1                             // one SAD_START_8 covers two rows; one pair is left to a separate SAD_START_8
+    SAD_START_8 uabal
+.endr
+.endm
+
+// Load two 16-byte rows from each source (x0/x1 and x2/x3 are pointer/stride
+// pairs). The first row is combined with \f (low half into v16.8h) and
+// \f followed by "2" (high half into v17.8h); the second row is always
+// accumulated with uabal/uabal2.
+.macro SAD_START_16 f
+    ld1             {v0.16b}, [x0], x1
+    ld1             {v1.16b}, [x2], x3
+    ld1             {v2.16b}, [x0], x1
+    ld1             {v3.16b}, [x2], x3
+    \f              v16.8h, v0.8b, v1.8b
+    \f\()2          v17.8h, v0.16b, v1.16b
+    uabal           v16.8h, v2.8b, v3.8b
+    uabal2          v17.8h, v2.16b, v3.16b
+.endm
+
+.macro SAD_16 h                              // accumulate further row pairs of a 16-wide block of height \h
+.rept \h / 2 - 1                             // one SAD_START_16 covers two rows; one pair is left to a separate SAD_START_16
+    SAD_START_16 uabal
+.endr
+.endm
+
+.macro SAD_START_32                          // clear v16-v19, the accumulators that SAD_32 adds into
+    movi            v16.16b, #0
+    movi            v17.16b, #0
+    movi            v18.16b, #0
+    movi            v19.16b, #0
+.endm
+
+// Accumulate the absolute differences of two 32-byte rows into v16-v19.
+// x0/x1 and x2/x3 are the pointer/stride pairs of the two sources; the
+// accumulators are not cleared here.
+.macro SAD_32
+    ld1             {v0.16b-v1.16b}, [x0], x1
+    ld1             {v2.16b-v3.16b}, [x2], x3
+    ld1             {v4.16b-v5.16b}, [x0], x1
+    ld1             {v6.16b-v7.16b}, [x2], x3
+    uabal           v16.8h, v0.8b, v2.8b
+    uabal2          v17.8h, v0.16b, v2.16b
+    uabal           v18.8h, v1.8b, v3.8b
+    uabal2          v19.8h, v1.16b, v3.16b
+    uabal           v16.8h, v4.8b, v6.8b
+    uabal2          v17.8h, v4.16b, v6.16b
+    uabal           v18.8h, v5.8b, v7.8b
+    uabal2          v19.8h, v5.16b, v7.16b
+.endm
+
+.macro SAD_END_32                            // reduce v16-v19 to a scalar in w0 and return
+    add             v16.8h, v16.8h, v17.8h
+    add             v17.8h, v18.8h, v19.8h
+    add             v16.8h, v16.8h, v17.8h   // v16 = v16 + v17 + v18 + v19 (16-bit lanes)
+    uaddlv          s0, v16.8h               // widening horizontal add of the eight lanes
+    fmov            w0, s0
+    ret
+.endm
+
+.macro SAD_START_64                          // clear v16-v23, the accumulators that SAD_64 adds into
+    movi            v16.16b, #0
+    movi            v17.16b, #0
+    movi            v18.16b, #0
+    movi            v19.16b, #0
+    movi            v20.16b, #0
+    movi            v21.16b, #0
+    movi            v22.16b, #0
+    movi            v23.16b, #0
+.endm
+
+// Accumulate the absolute differences of two 64-byte rows into v16-v23.
+// x0/x1 and x2/x3 are the pointer/stride pairs of the two sources; the
+// accumulators are not cleared here.
+.macro SAD_64
+    ld1             {v0.16b-v3.16b}, [x0], x1
+    ld1             {v4.16b-v7.16b}, [x2], x3
+    ld1             {v24.16b-v27.16b}, [x0], x1
+    ld1             {v28.16b-v31.16b}, [x2], x3
+    uabal           v16.8h, v0.8b, v4.8b
+    uabal2          v17.8h, v0.16b, v4.16b
+    uabal           v18.8h, v1.8b, v5.8b
+    uabal2          v19.8h, v1.16b, v5.16b
+    uabal           v20.8h, v2.8b, v6.8b
+    uabal2          v21.8h, v2.16b, v6.16b
+    uabal           v22.8h, v3.8b, v7.8b
+    uabal2          v23.8h, v3.16b, v7.16b
+
+    uabal           v16.8h, v24.8b, v28.8b
+    uabal2          v17.8h, v24.16b, v28.16b
+    uabal           v18.8h, v25.8b, v29.8b
+    uabal2          v19.8h, v25.16b, v29.16b
+    uabal           v20.8h, v26.8b, v30.8b
+    uabal2          v21.8h, v26.16b, v30.16b
+    uabal           v22.8h, v27.8b, v31.8b
+    uabal2          v23.8h, v27.16b, v31.16b
+.endm
+
+.macro SAD_END_64                            // reduce v16-v23 to a scalar in x0 and return
+    add             v16.8h, v16.8h, v17.8h
+    add             v17.8h, v18.8h, v19.8h
+    add             v16.8h, v16.8h, v17.8h   // v16 = v16 + v17 + v18 + v19 (16-bit lanes)
+    uaddlp          v16.4s, v16.8h           // widen to 32-bit lanes before combining the two halves
+    add             v18.8h, v20.8h, v21.8h
+    add             v19.8h, v22.8h, v23.8h
+    add             v17.8h, v18.8h, v19.8h   // v17 = v20 + v21 + v22 + v23 (16-bit lanes)
+    uaddlp          v17.4s, v17.8h
+    add             v16.4s, v16.4s, v17.4s
+    uaddlv          d0, v16.4s               // widening horizontal add of the four lanes
+    fmov            x0, d0
+    ret
+.endm
+
+// Prepare a 12-wide SAD: load the 16-byte constant sad12_mask (defined
+// outside this section) into v31 via x12 and clear the accumulators v16/v17.
+.macro SAD_START_12
+    movrel          x12, sad12_mask
+    ld1             {v31.16b}, [x12]
+    movi            v16.16b, #0
+    movi            v17.16b, #0
+.endm
+
+// Accumulate the absolute differences of two rows into v16/v17. Each row is
+// loaded as 16 bytes and ANDed with the mask in v31 (set up by SAD_START_12;
+// presumably it keeps only the first 12 bytes - confirm against sad12_mask).
+// x0/x1 and x2/x3 are the pointer/stride pairs of the two sources.
+.macro SAD_12
+    ld1             {v0.16b}, [x0], x1
+    and             v0.16b, v0.16b, v31.16b
+    ld1             {v1.16b}, [x2], x3
+    and             v1.16b, v1.16b, v31.16b
+    ld1             {v2.16b}, [x0], x1
+    and             v2.16b, v2.16b, v31.16b
+    ld1             {v3.16b}, [x2], x3
+    and             v3.16b, v3.16b, v31.16b
+    uabal           v16.8h, v0.8b, v1.8b
+    uabal2          v17.8h, v0.16b, v1.16b
+    uabal           v16.8h, v2.8b, v3.8b
+    uabal2          v17.8h, v2.16b, v3.16b
+.endm
+
+.macro SAD_END_12                            // reduce v16/v17 to a scalar in w0 and return
+    add             v16.8h, v16.8h, v17.8h
+    uaddlv          s0, v16.8h               // widening horizontal add of the eight lanes
+    fmov            w0, s0
+    ret
+.endm
+
+.macro SAD_START_24
+    movi            v16.16b, #0
+    movi            v17.16b, #0
+    movi            v18.16b, #0
+    sub             x1, x1, #16

 
@@ -0,0 +1,514 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// This file contains the macros written using NEON instruction set
+// that are also used by the SVE2 functions
+
+#include "asm.S"
+
+.arch           armv8-a
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+// Load two 4-byte rows from each source (x0/x1 and x2/x3 are pointer/stride
+// pairs) into lanes 0 and 1 of v0/v1, then apply \f to them into v16.8h.
+// \f is the instruction mnemonic: uabdl starts the accumulator, uabal adds to it.
+.macro SAD_START_4 f
+    ld1             {v0.s}[0], [x0], x1
+    ld1             {v0.s}[1], [x0], x1
+    ld1             {v1.s}[0], [x2], x3
+    ld1             {v1.s}[1], [x2], x3
+    \f              v16.8h, v0.8b, v1.8b
+.endm
+
+.macro SAD_4 h                               // accumulate further row pairs of a 4-wide block of height \h
+.rept \h / 2 - 1                             // one SAD_START_4 covers two rows; one pair is left to a separate SAD_START_4
+    SAD_START_4 uabal
+.endr
+.endm
+
+// Load two 8-byte rows from each source (x0/x1 and x2/x3 are pointer/stride
+// pairs) and apply \f to each row pair: the first row goes to v16.8h, the
+// second to v17.8h. \f is the instruction mnemonic (uabdl to start, uabal to add).
+.macro SAD_START_8 f
+    ld1             {v0.8b}, [x0], x1
+    ld1             {v1.8b}, [x2], x3
+    ld1             {v2.8b}, [x0], x1
+    ld1             {v3.8b}, [x2], x3
+    \f              v16.8h, v0.8b, v1.8b
+    \f              v17.8h, v2.8b, v3.8b
+.endm
+
+.macro SAD_8 h                               // accumulate further row pairs of an 8-wide block of height \h
+.rept \h / 2 - 1                             // one SAD_START_8 covers two rows; one pair is left to a separate SAD_START_8
+    SAD_START_8 uabal
+.endr
+.endm
+
+// Load two 16-byte rows from each source (x0/x1 and x2/x3 are pointer/stride
+// pairs). The first row is combined with \f (low half into v16.8h) and
+// \f followed by "2" (high half into v17.8h); the second row is always
+// accumulated with uabal/uabal2.
+.macro SAD_START_16 f
+    ld1             {v0.16b}, [x0], x1
+    ld1             {v1.16b}, [x2], x3
+    ld1             {v2.16b}, [x0], x1
+    ld1             {v3.16b}, [x2], x3
+    \f              v16.8h, v0.8b, v1.8b
+    \f\()2          v17.8h, v0.16b, v1.16b
+    uabal           v16.8h, v2.8b, v3.8b
+    uabal2          v17.8h, v2.16b, v3.16b
+.endm
+
+.macro SAD_16 h                              // accumulate further row pairs of a 16-wide block of height \h
+.rept \h / 2 - 1                             // one SAD_START_16 covers two rows; one pair is left to a separate SAD_START_16
+    SAD_START_16 uabal
+.endr
+.endm
+
+.macro SAD_START_32                          // clear v16-v19, the accumulators that SAD_32 adds into
+    movi            v16.16b, #0
+    movi            v17.16b, #0
+    movi            v18.16b, #0
+    movi            v19.16b, #0
+.endm
+
+// Accumulate the absolute differences of two 32-byte rows into v16-v19.
+// x0/x1 and x2/x3 are the pointer/stride pairs of the two sources; the
+// accumulators are not cleared here.
+.macro SAD_32
+    ld1             {v0.16b-v1.16b}, [x0], x1
+    ld1             {v2.16b-v3.16b}, [x2], x3
+    ld1             {v4.16b-v5.16b}, [x0], x1
+    ld1             {v6.16b-v7.16b}, [x2], x3
+    uabal           v16.8h, v0.8b, v2.8b
+    uabal2          v17.8h, v0.16b, v2.16b
+    uabal           v18.8h, v1.8b, v3.8b
+    uabal2          v19.8h, v1.16b, v3.16b
+    uabal           v16.8h, v4.8b, v6.8b
+    uabal2          v17.8h, v4.16b, v6.16b
+    uabal           v18.8h, v5.8b, v7.8b
+    uabal2          v19.8h, v5.16b, v7.16b
+.endm
+
+.macro SAD_END_32                            // reduce v16-v19 to a scalar in w0 and return
+    add             v16.8h, v16.8h, v17.8h
+    add             v17.8h, v18.8h, v19.8h
+    add             v16.8h, v16.8h, v17.8h   // v16 = v16 + v17 + v18 + v19 (16-bit lanes)
+    uaddlv          s0, v16.8h               // widening horizontal add of the eight lanes
+    fmov            w0, s0
+    ret
+.endm
+
+.macro SAD_START_64                          // clear v16-v23, the accumulators that SAD_64 adds into
+    movi            v16.16b, #0
+    movi            v17.16b, #0
+    movi            v18.16b, #0
+    movi            v19.16b, #0
+    movi            v20.16b, #0
+    movi            v21.16b, #0
+    movi            v22.16b, #0
+    movi            v23.16b, #0
+.endm
+
+// Accumulate the absolute differences of two 64-byte rows into v16-v23.
+// x0/x1 and x2/x3 are the pointer/stride pairs of the two sources; the
+// accumulators are not cleared here.
+.macro SAD_64
+    ld1             {v0.16b-v3.16b}, [x0], x1
+    ld1             {v4.16b-v7.16b}, [x2], x3
+    ld1             {v24.16b-v27.16b}, [x0], x1
+    ld1             {v28.16b-v31.16b}, [x2], x3
+    uabal           v16.8h, v0.8b, v4.8b
+    uabal2          v17.8h, v0.16b, v4.16b
+    uabal           v18.8h, v1.8b, v5.8b
+    uabal2          v19.8h, v1.16b, v5.16b
+    uabal           v20.8h, v2.8b, v6.8b
+    uabal2          v21.8h, v2.16b, v6.16b
+    uabal           v22.8h, v3.8b, v7.8b
+    uabal2          v23.8h, v3.16b, v7.16b
+
+    uabal           v16.8h, v24.8b, v28.8b
+    uabal2          v17.8h, v24.16b, v28.16b
+    uabal           v18.8h, v25.8b, v29.8b
+    uabal2          v19.8h, v25.16b, v29.16b
+    uabal           v20.8h, v26.8b, v30.8b
+    uabal2          v21.8h, v26.16b, v30.16b
+    uabal           v22.8h, v27.8b, v31.8b
+    uabal2          v23.8h, v27.16b, v31.16b
+.endm
+
+.macro SAD_END_64                            // reduce v16-v23 to a scalar in x0 and return
+    add             v16.8h, v16.8h, v17.8h
+    add             v17.8h, v18.8h, v19.8h
+    add             v16.8h, v16.8h, v17.8h   // v16 = v16 + v17 + v18 + v19 (16-bit lanes)
+    uaddlp          v16.4s, v16.8h           // widen to 32-bit lanes before combining the two halves
+    add             v18.8h, v20.8h, v21.8h
+    add             v19.8h, v22.8h, v23.8h
+    add             v17.8h, v18.8h, v19.8h   // v17 = v20 + v21 + v22 + v23 (16-bit lanes)
+    uaddlp          v17.4s, v17.8h
+    add             v16.4s, v16.4s, v17.4s
+    uaddlv          d0, v16.4s               // widening horizontal add of the four lanes
+    fmov            x0, d0
+    ret
+.endm
+
+// Prepare a 12-wide SAD: load the 16-byte constant sad12_mask (defined
+// outside this section) into v31 via x12 and clear the accumulators v16/v17.
+.macro SAD_START_12
+    movrel          x12, sad12_mask
+    ld1             {v31.16b}, [x12]
+    movi            v16.16b, #0
+    movi            v17.16b, #0
+.endm
+
+// Accumulate the absolute differences of two rows into v16/v17. Each row is
+// loaded as 16 bytes and ANDed with the mask in v31 (set up by SAD_START_12;
+// presumably it keeps only the first 12 bytes - confirm against sad12_mask).
+// x0/x1 and x2/x3 are the pointer/stride pairs of the two sources.
+.macro SAD_12
+    ld1             {v0.16b}, [x0], x1
+    and             v0.16b, v0.16b, v31.16b
+    ld1             {v1.16b}, [x2], x3
+    and             v1.16b, v1.16b, v31.16b
+    ld1             {v2.16b}, [x0], x1
+    and             v2.16b, v2.16b, v31.16b
+    ld1             {v3.16b}, [x2], x3
+    and             v3.16b, v3.16b, v31.16b
+    uabal           v16.8h, v0.8b, v1.8b
+    uabal2          v17.8h, v0.16b, v1.16b
+    uabal           v16.8h, v2.8b, v3.8b
+    uabal2          v17.8h, v2.16b, v3.16b
+.endm
+
+.macro SAD_END_12                            // reduce v16/v17 to a scalar in w0 and return
+    add             v16.8h, v16.8h, v17.8h
+    uaddlv          s0, v16.8h               // widening horizontal add of the eight lanes
+    fmov            w0, s0
+    ret
+.endm
+
+.macro SAD_START_24
+    movi            v16.16b, #0
+    movi            v17.16b, #0
+    movi            v18.16b, #0
+    sub             x1, x1, #16
​

x265_3.6.tar.gz/source/common/aarch64/sad-a-sve2.S Added

@@ -0,0 +1,511 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+#include "sad-a-common.S"
+
+.arch armv8-a+sve2
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// SVE2 SAD of a 16-wide block of height \h; the result is returned in w0.
+// x0/x1 and x2/x3 are the pointer/stride pairs of the two sources.
+// Uses 16 halfword lanes, so it presumably is only expanded on the
+// "vector length > 16 bytes" path of SAD_FUNC_SVE2 - confirm at the call site.
+.macro SAD_SVE2_16 h
+    mov             z16.d, #0
+    ptrue           p0.h, vl16
+.rept \h
+    ld1b            {z0.h}, p0/z, [x0]       // 16 bytes, each zero-extended to a 16-bit lane
+    ld1b            {z2.h}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    uaba            z16.h, z0.h, z2.h        // z16 += |z0 - z2|
+.endr
+    uaddv           d0, p0, z16.h
+    fmov            w0, s0
+    ret
+.endm
+
+// SVE2 SAD of a 32-wide block of height \h; the result is returned in w0.
+// x0/x1 and x2/x3 are the pointer/stride pairs of the two sources.
+.macro SAD_SVE2_32 h
+    // uabalb/uabalt accumulate into z16, so it has to start at zero;
+    // without this the result would include whatever z16 held on entry.
+    mov             z16.d, #0
+    ptrue           p0.b, vl32
+.rept \h
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z4.b}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    uabalb          z16.h, z0.b, z4.b        // even-numbered bytes
+    uabalt          z16.h, z0.b, z4.b        // odd-numbered bytes
+.endr
+    uaddv           d0, p0, z16.h
+    fmov            w0, s0
+    ret
+.endm
+
+// SVE2 SAD of a 64-wide block of height \h; the result is returned in x0.
+// x0/x1 and x2/x3 are the pointer/stride pairs of the two sources and x9
+// must hold the vector length in bytes. For VL <= 48 bytes each row is read
+// as two 32-byte loads, otherwise as a single 64-byte load.
+.macro SAD_SVE2_64 h
+    cmp             x9, #48
+    bgt             .vl_gt_48_pixel_sad_64x\h
+    mov             z16.d, #0
+    mov             z17.d, #0
+    mov             z18.d, #0
+    mov             z19.d, #0
+    // z24 is the 32-bit accumulator of the uadalp below, so it must be
+    // cleared on this path too (the VL > 48 path already does so).
+    mov             z24.d, #0
+    ptrue           p0.b, vl32
+.rept \h
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p0/z, [x0, #1, mul vl]   // bytes at x0 + VL (second 32 bytes when VL is 32)
+    ld1b            {z4.b}, p0/z, [x2]
+    ld1b            {z5.b}, p0/z, [x2, #1, mul vl]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    uabalb          z16.h, z0.b, z4.b
+    uabalt          z17.h, z0.b, z4.b
+    uabalb          z18.h, z1.b, z5.b
+    uabalt          z19.h, z1.b, z5.b
+.endr
+    add             z16.h, z16.h, z17.h
+    add             z17.h, z18.h, z19.h
+    add             z16.h, z16.h, z17.h
+    uadalp          z24.s, p0/m, z16.h       // widen pairs of 16-bit sums into 32-bit lanes
+    uaddv           d5, p0, z24.s
+    fmov            x0, d5
+    ret
+.vl_gt_48_pixel_sad_64x\h\():
+    mov             z16.d, #0
+    mov             z17.d, #0
+    mov             z24.d, #0
+    ptrue           p0.b, vl64
+.rept \h
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z4.b}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    uabalb          z16.h, z0.b, z4.b
+    uabalt          z17.h, z0.b, z4.b
+.endr
+    add             z16.h, z16.h, z17.h
+    uadalp          z24.s, p0/m, z16.h
+    uaddv           d5, p0, z24.s
+    fmov            x0, d5
+    ret
+.endm
+
+// SVE2 SAD of a 24-wide block of height \h; the result is returned in w0.
+// x0/x1 and x2/x3 are the pointer/stride pairs of the two sources.
+// x10 and x11 are used as scratch to build a predicate covering 24 bytes.
+.macro SAD_SVE2_24 h
+    mov             z16.d, #0
+    mov             x10, #24
+    mov             x11, #0
+    whilelt         p0.b, x11, x10           // p0 = byte lanes 0..23
+.rept \h
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z8.b}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    uabalb          z16.h, z0.b, z8.b        // even-numbered bytes
+    uabalt          z16.h, z0.b, z8.b        // odd-numbered bytes
+.endr
+    uaddv           d5, p0, z16.h
+    fmov            w0, s5
+    ret
+.endm
+
+// SVE2 SAD of a 48-wide block of height \h; the result is returned in w0.
+// x0/x1 and x2/x3 are the pointer/stride pairs of the two sources and x9
+// must hold the vector length in bytes. For VL <= 48 bytes each row is read
+// as a 32-byte load plus a 16-byte load, otherwise as one 48-byte load.
+.macro SAD_SVE2_48 h
+    cmp             x9, #48
+    bgt             .vl_gt_48_pixel_sad_48x\h
+    mov             z16.d, #0
+    mov             z17.d, #0
+    mov             z18.d, #0
+    mov             z19.d, #0
+    ptrue           p0.b, vl32
+    ptrue           p1.b, vl16
+.rept \h
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p1/z, [x0, #1, mul vl]   // 16 bytes at x0 + VL
+    ld1b            {z8.b}, p0/z, [x2]
+    ld1b            {z9.b}, p1/z, [x2, #1, mul vl]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    uabalb          z16.h, z0.b, z8.b
+    uabalt          z17.h, z0.b, z8.b
+    uabalb          z18.h, z1.b, z9.b
+    uabalt          z19.h, z1.b, z9.b
+.endr
+    add             z16.h, z16.h, z17.h
+    add             z17.h, z18.h, z19.h
+    add             z16.h, z16.h, z17.h
+    uaddv           d5, p0, z16.h
+    fmov            w0, s5
+    ret
+.vl_gt_48_pixel_sad_48x\h\():
+    mov             z16.d, #0
+    mov             z17.d, #0
+    mov             x10, #48
+    mov             x11, #0
+    whilelt         p0.b, x11, x10           // p0 = byte lanes 0..47
+.rept \h
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z8.b}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    uabalb          z16.h, z0.b, z8.b
+    uabalt          z17.h, z0.b, z8.b
+.endr
+    add             z16.h, z16.h, z17.h
+    uaddv           d5, p0, z16.h
+    fmov            w0, s5
+    ret
+.endm
+
+// Fully unrolled.
+.macro SAD_FUNC_SVE2 w, h
+function PFX(pixel_sad_\w\()x\h\()_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_sad_\w\()x\h
+    SAD_START_\w uabdl
+    SAD_\w \h
+.if \w > 4
+    add             v16.8h, v16.8h, v17.8h
+.endif
+    uaddlv          s0, v16.8h
+    fmov            w0, s0
+    ret
+.vl_gt_16_pixel_sad_\w\()x\h\():
+.if \w == 4 || \w == 8 || \w == 12
+    SAD_START_\w uabdl
+    SAD_\w \h
+.if \w > 4

 
@@ -0,0 +1,511 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+#include "sad-a-common.S"
+
+.arch armv8-a+sve2
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// SAD of a 16-pixel-wide block of \h rows, fully unrolled.
+// x0/x2: the two pixel rows being compared, x1/x3: their strides.
+// Each row is loaded as 16 bytes zero-extended into 16 halfword lanes, so the
+// vector must hold at least 16 halfwords (presumably ensured by the caller's
+// vector-length check -- confirm). The sum is returned in w0.
+.macro SAD_SVE2_16 h
+    mov             z16.d, #0
+    ptrue           p0.h, vl16
+.rept \h
+    ld1b            {z0.h}, p0/z, [x0]
+    ld1b            {z2.h}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    uaba            z16.h, z0.h, z2.h
+.endr
+    uaddv           d0, p0, z16.h
+    fmov            w0, s0
+    ret
+.endm
+
+// SAD of a 32-pixel-wide block of \h rows, fully unrolled.
+// x0/x2: the two pixel rows being compared, x1/x3: their strides.
+// The sum is returned in w0.
+.macro SAD_SVE2_32 h
+    // uabalb/uabalt accumulate into z16, so it must start from zero.
+    mov             z16.d, #0
+    ptrue           p0.b, vl32
+.rept \h
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z4.b}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    uabalb          z16.h, z0.b, z4.b
+    uabalt          z16.h, z0.b, z4.b
+.endr
+    uaddv           d0, p0, z16.h
+    fmov            w0, s0
+    ret
+.endm
+
+// SAD of a 64-pixel-wide block of \h rows, fully unrolled.
+// x0/x2: the two pixel rows being compared, x1/x3: their strides.
+// x9 is compared against 48 to choose a path; presumably it holds the vector
+// length in bytes (SAD_FUNC_SVE2 executes "rdvl x9, #1") -- confirm at caller.
+// Halfword partial sums are widened into z24.s before the final reduction;
+// the sum is returned in x0.
+.macro SAD_SVE2_64 h
+    cmp             x9, #48
+    bgt             .vl_gt_48_pixel_sad_64x\h
+    // x9 <= 48: read each row as two 32-byte loads.
+    mov             z16.d, #0
+    mov             z17.d, #0
+    mov             z18.d, #0
+    mov             z19.d, #0
+    // uadalp below accumulates into z24, so it must start from zero.
+    mov             z24.d, #0
+    ptrue           p0.b, vl32
+.rept \h
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z4.b}, p0/z, [x2]
+    ld1b            {z5.b}, p0/z, [x2, #1, mul vl]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    uabalb          z16.h, z0.b, z4.b
+    uabalt          z17.h, z0.b, z4.b
+    uabalb          z18.h, z1.b, z5.b
+    uabalt          z19.h, z1.b, z5.b
+.endr
+    add             z16.h, z16.h, z17.h
+    add             z17.h, z18.h, z19.h
+    add             z16.h, z16.h, z17.h
+    uadalp          z24.s, p0/m, z16.h
+    uaddv           d5, p0, z24.s
+    fmov            x0, d5
+    ret
+.vl_gt_48_pixel_sad_64x\h\():
+    // x9 > 48: a single 64-byte load covers a row.
+    mov             z16.d, #0
+    mov             z17.d, #0
+    mov             z24.d, #0
+    ptrue           p0.b, vl64
+.rept \h
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z4.b}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    uabalb          z16.h, z0.b, z4.b
+    uabalt          z17.h, z0.b, z4.b
+.endr
+    add             z16.h, z16.h, z17.h
+    uadalp          z24.s, p0/m, z16.h
+    uaddv           d5, p0, z24.s
+    fmov            x0, d5
+    ret
+.endm
+
+// SAD of a 24-pixel-wide block of \h rows, fully unrolled.
+// x0/x2: the two pixel rows being compared, x1/x3: their strides.
+// A whilelt predicate activates the first 24 byte lanes, so one load covers a
+// row. x10/x11 are used as scratch. The sum is returned in w0.
+.macro SAD_SVE2_24 h
+    mov             z16.d, #0
+    mov             x10, #24
+    mov             x11, #0
+    whilelt         p0.b, x11, x10
+.rept \h
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z8.b}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    uabalb          z16.h, z0.b, z8.b
+    uabalt          z16.h, z0.b, z8.b
+.endr
+    uaddv           d5, p0, z16.h
+    fmov            w0, s5
+    ret
+.endm
+
+// SAD of a 48-pixel-wide block of \h rows, fully unrolled.
+// x0/x2: the two pixel rows being compared, x1/x3: their strides.
+// x9 is compared against 48 to choose a path; presumably it holds the vector
+// length in bytes (SAD_FUNC_SVE2 executes "rdvl x9, #1") -- confirm at caller.
+// The sum is returned in w0.
+.macro SAD_SVE2_48 h
+    cmp             x9, #48
+    bgt             .vl_gt_48_pixel_sad_48x\h
+    // x9 <= 48: read each row as one 32-byte load plus one 16-byte load.
+    mov             z16.d, #0
+    mov             z17.d, #0
+    mov             z18.d, #0
+    mov             z19.d, #0
+    ptrue           p0.b, vl32
+    ptrue           p1.b, vl16
+.rept \h
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p1/z, [x0, #1, mul vl]
+    ld1b            {z8.b}, p0/z, [x2]
+    ld1b            {z9.b}, p1/z, [x2, #1, mul vl]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    // Even/odd byte lanes are accumulated into separate halfword accumulators.
+    uabalb          z16.h, z0.b, z8.b
+    uabalt          z17.h, z0.b, z8.b
+    uabalb          z18.h, z1.b, z9.b
+    uabalt          z19.h, z1.b, z9.b
+.endr
+    add             z16.h, z16.h, z17.h
+    add             z17.h, z18.h, z19.h
+    add             z16.h, z16.h, z17.h
+    uaddv           d5, p0, z16.h
+    fmov            w0, s5
+    ret
+.vl_gt_48_pixel_sad_48x\h\():
+    // x9 > 48: one predicated load (first 48 byte lanes active) covers a row.
+    mov             z16.d, #0
+    mov             z17.d, #0
+    mov             x10, #48
+    mov             x11, #0
+    whilelt         p0.b, x11, x10
+.rept \h
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z8.b}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    uabalb          z16.h, z0.b, z8.b
+    uabalt          z17.h, z0.b, z8.b
+.endr
+    add             z16.h, z16.h, z17.h
+    uaddv           d5, p0, z16.h
+    fmov            w0, s5
+    ret
+.endm
+
+// Fully unrolled.
+.macro SAD_FUNC_SVE2 w, h
+function PFX(pixel_sad_\w\()x\h\()_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_sad_\w\()x\h
+    SAD_START_\w uabdl
+    SAD_\w \h
+.if \w > 4
+    add             v16.8h, v16.8h, v17.8h
+.endif
+    uaddlv          s0, v16.8h
+    fmov            w0, s0
+    ret
+.vl_gt_16_pixel_sad_\w\()x\h\():
+.if \w == 4 || \w == 8 || \w == 12
+    SAD_START_\w uabdl
+    SAD_\w \h
+.if \w > 4
​

x265_3.5.tar.gz/source/common/aarch64/sad-a.S -> x265_3.6.tar.gz/source/common/aarch64/sad-a.S Changed

@@ -1,7 +1,8 @@
 /*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
  *
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *          Sebastian Pop <spop@amazon.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -22,84 +23,186 @@
  *****************************************************************************/
 
 #include "asm.S"
+#include "sad-a-common.S"
 
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
 .section .rodata
+#endif
 
 .align 4
 
 .text
 
-.macro SAD_X_START_8 x
-    ld1             {v0.8b}, x0, x9
-.if \x == 3
-    ld1             {v1.8b}, x1, x4
-    ld1             {v2.8b}, x2, x4
-    ld1             {v3.8b}, x3, x4
-.elseif \x == 4
-    ld1             {v1.8b}, x1, x5
-    ld1             {v2.8b}, x2, x5
-    ld1             {v3.8b}, x3, x5
-    ld1             {v4.8b}, x4, x5
-.endif
-    uabdl           v16.8h, v0.8b, v1.8b
-    uabdl           v17.8h, v0.8b, v2.8b
-    uabdl           v18.8h, v0.8b, v3.8b
-.if \x == 4
-    uabdl           v19.8h, v0.8b, v4.8b
+// Fully unrolled.
+.macro SAD_FUNC w, h
+function PFX(pixel_sad_\w\()x\h\()_neon)
+    SAD_START_\w uabdl
+    SAD_\w \h
+.if \w > 4
+    add             v16.8h, v16.8h, v17.8h
 .endif
+    uaddlv          s0, v16.8h
+    fmov            w0, s0
+    ret
+endfunc
+.endm
+
+// Loop unrolled 4.
+.macro SAD_FUNC_LOOP w, h
+function PFX(pixel_sad_\w\()x\h\()_neon)
+    SAD_START_\w
+
+    mov             w9, #\h/8
+.loop_\w\()x\h:
+    sub             w9, w9, #1
+.rept 4
+    SAD_\w
+.endr
+    cbnz            w9, .loop_\w\()x\h
+
+    SAD_END_\w
+endfunc
 .endm
 
-.macro SAD_X_8 x
-    ld1             {v0.8b}, x0, x9
+SAD_FUNC  4,  4
+SAD_FUNC  4,  8
+SAD_FUNC  4,  16
+SAD_FUNC  8,  4
+SAD_FUNC  8,  8
+SAD_FUNC  8,  16
+SAD_FUNC  8,  32
+SAD_FUNC  16, 4
+SAD_FUNC  16, 8
+SAD_FUNC  16, 12
+SAD_FUNC  16, 16
+SAD_FUNC  16, 32
+SAD_FUNC  16, 64
+
+SAD_FUNC_LOOP  32, 8
+SAD_FUNC_LOOP  32, 16
+SAD_FUNC_LOOP  32, 24
+SAD_FUNC_LOOP  32, 32
+SAD_FUNC_LOOP  32, 64
+SAD_FUNC_LOOP  64, 16
+SAD_FUNC_LOOP  64, 32
+SAD_FUNC_LOOP  64, 48
+SAD_FUNC_LOOP  64, 64
+SAD_FUNC_LOOP  12, 16
+SAD_FUNC_LOOP  24, 32
+SAD_FUNC_LOOP  48, 64
+
+// SAD_X3 and SAD_X4 code start
+
+// static void x264_pixel_sad_x3_##size(pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, intptr_t i_stride, int scores[3])
+// static void x264_pixel_sad_x4_##size(pixel *fenc, pixel *pix0, pixel *pix1,pixel *pix2, pixel *pix3, intptr_t i_stride, int scores[4])
+.macro SAD_X_FUNC x, w, h
+function PFX(sad_x\x\()_\w\()x\h\()_neon)
+    mov             x9, #FENC_STRIDE
+
+// Make function arguments for x == 3 look like x == 4.
 .if \x == 3
-    ld1             {v1.8b}, x1, x4
-    ld1             {v2.8b}, x2, x4
-    ld1             {v3.8b}, x3, x4
-.elseif \x == 4
-    ld1             {v1.8b}, x1, x5
-    ld1             {v2.8b}, x2, x5
-    ld1             {v3.8b}, x3, x5
-    ld1             {v4.8b}, x4, x5
+    mov             x6, x5
+    mov             x5, x4
 .endif
-    uabal           v16.8h, v0.8b, v1.8b
-    uabal           v17.8h, v0.8b, v2.8b
-    uabal           v18.8h, v0.8b, v3.8b
-.if \x == 4
-    uabal           v19.8h, v0.8b, v4.8b
+
+.if \w == 12
+    movrel          x12, sad12_mask
+    ld1             {v31.16b}, x12
 .endif
+
+    SAD_X_START_\w \h, \x, uabdl
+    SAD_X_\w \h, \x
+    SAD_X_END_\w \x
+endfunc
 .endm
 
-.macro SAD_X_8xN x, h
-function x265_sad_x\x\()_8x\h\()_neon
+.macro SAD_X_LOOP x, w, h
+function PFX(sad_x\x\()_\w\()x\h\()_neon)
     mov             x9, #FENC_STRIDE
-    SAD_X_START_8 \x
-.rept \h - 1
-    SAD_X_8 \x
-.endr
-    uaddlv          s0, v16.8h
-    uaddlv          s1, v17.8h
-    uaddlv          s2, v18.8h
-.if \x == 4
-    uaddlv          s3, v19.8h
-.endif
 
+// Make function arguments for x == 3 look like x == 4.
 .if \x == 3
-    stp             s0, s1, x5
-    str             s2, x5, #8
-.elseif \x == 4
-    stp             s0, s1, x6
-    stp             s2, s3, x6, #8
+    mov             x6, x5
+    mov             x5, x4
 .endif
-    ret
+    SAD_X_START_\w \x
+    mov             w12, #\h/4
+.loop_sad_x\x\()_\w\()x\h:
+    sub             w12, w12, #1
+ .rept 4
+  .if \w == 24
+    ld1             {v6.16b}, x0, #16
+    ld1             {v7.8b}, x0, x9
+  .elseif \w == 32
+    ld1             {v6.16b-v7.16b}, x0, x9
+  .elseif \w == 48
+    ld1             {v4.16b-v6.16b}, x0, x9
+  .elseif \w == 64
+    ld1             {v4.16b-v7.16b}, x0, x9
+  .endif
+    SAD_X_\w x1, v16, v20
+    SAD_X_\w x2, v17, v21
+    SAD_X_\w x3, v18, v22
+  .if \x == 4
+    SAD_X_\w x4, v19, v23
+  .endif
+ .endr
+    cbnz            w12, .loop_sad_x\x\()_\w\()x\h
+    SAD_X_END_\w \x
 endfunc
 .endm
 
-SAD_X_8xN 3 4
-SAD_X_8xN 3 8
-SAD_X_8xN 3 16
-SAD_X_8xN 3 32

 
@@ -1,7 +1,8 @@
 /*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
  *
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *          Sebastian Pop <spop@amazon.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -22,84 +23,186 @@
  *****************************************************************************/
 
 #include "asm.S"
+#include "sad-a-common.S"
 
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
 .section .rodata
+#endif
 
 .align 4
 
 .text
 
-.macro SAD_X_START_8 x
-    ld1             {v0.8b}, x0, x9
-.if \x == 3
-    ld1             {v1.8b}, x1, x4
-    ld1             {v2.8b}, x2, x4
-    ld1             {v3.8b}, x3, x4
-.elseif \x == 4
-    ld1             {v1.8b}, x1, x5
-    ld1             {v2.8b}, x2, x5
-    ld1             {v3.8b}, x3, x5
-    ld1             {v4.8b}, x4, x5
-.endif
-    uabdl           v16.8h, v0.8b, v1.8b
-    uabdl           v17.8h, v0.8b, v2.8b
-    uabdl           v18.8h, v0.8b, v3.8b
-.if \x == 4
-    uabdl           v19.8h, v0.8b, v4.8b
+// Fully unrolled.
+.macro SAD_FUNC w, h
+function PFX(pixel_sad_\w\()x\h\()_neon)
+    SAD_START_\w uabdl
+    SAD_\w \h
+.if \w > 4
+    add             v16.8h, v16.8h, v17.8h
 .endif
+    uaddlv          s0, v16.8h
+    fmov            w0, s0
+    ret
+endfunc
+.endm
+
+// Loop unrolled 4.
+.macro SAD_FUNC_LOOP w, h
+function PFX(pixel_sad_\w\()x\h\()_neon)
+    SAD_START_\w
+
+    mov             w9, #\h/8
+.loop_\w\()x\h:
+    sub             w9, w9, #1
+.rept 4
+    SAD_\w
+.endr
+    cbnz            w9, .loop_\w\()x\h
+
+    SAD_END_\w
+endfunc
 .endm
 
-.macro SAD_X_8 x
-    ld1             {v0.8b}, x0, x9
+SAD_FUNC  4,  4
+SAD_FUNC  4,  8
+SAD_FUNC  4,  16
+SAD_FUNC  8,  4
+SAD_FUNC  8,  8
+SAD_FUNC  8,  16
+SAD_FUNC  8,  32
+SAD_FUNC  16, 4
+SAD_FUNC  16, 8
+SAD_FUNC  16, 12
+SAD_FUNC  16, 16
+SAD_FUNC  16, 32
+SAD_FUNC  16, 64
+
+SAD_FUNC_LOOP  32, 8
+SAD_FUNC_LOOP  32, 16
+SAD_FUNC_LOOP  32, 24
+SAD_FUNC_LOOP  32, 32
+SAD_FUNC_LOOP  32, 64
+SAD_FUNC_LOOP  64, 16
+SAD_FUNC_LOOP  64, 32
+SAD_FUNC_LOOP  64, 48
+SAD_FUNC_LOOP  64, 64
+SAD_FUNC_LOOP  12, 16
+SAD_FUNC_LOOP  24, 32
+SAD_FUNC_LOOP  48, 64
+
+// SAD_X3 and SAD_X4 code start
+
+// static void x264_pixel_sad_x3_##size(pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, intptr_t i_stride, int scores[3])
+// static void x264_pixel_sad_x4_##size(pixel *fenc, pixel *pix0, pixel *pix1,pixel *pix2, pixel *pix3, intptr_t i_stride, int scores[4])
+.macro SAD_X_FUNC x, w, h
+function PFX(sad_x\x\()_\w\()x\h\()_neon)
+    mov             x9, #FENC_STRIDE
+
+// Make function arguments for x == 3 look like x == 4.
 .if \x == 3
-    ld1             {v1.8b}, x1, x4
-    ld1             {v2.8b}, x2, x4
-    ld1             {v3.8b}, x3, x4
-.elseif \x == 4
-    ld1             {v1.8b}, x1, x5
-    ld1             {v2.8b}, x2, x5
-    ld1             {v3.8b}, x3, x5
-    ld1             {v4.8b}, x4, x5
+    mov             x6, x5
+    mov             x5, x4
 .endif
-    uabal           v16.8h, v0.8b, v1.8b
-    uabal           v17.8h, v0.8b, v2.8b
-    uabal           v18.8h, v0.8b, v3.8b
-.if \x == 4
-    uabal           v19.8h, v0.8b, v4.8b
+
+.if \w == 12
+    movrel          x12, sad12_mask
+    ld1             {v31.16b}, x12
 .endif
+
+    SAD_X_START_\w \h, \x, uabdl
+    SAD_X_\w \h, \x
+    SAD_X_END_\w \x
+endfunc
 .endm
 
-.macro SAD_X_8xN x, h
-function x265_sad_x\x\()_8x\h\()_neon
+.macro SAD_X_LOOP x, w, h
+function PFX(sad_x\x\()_\w\()x\h\()_neon)
     mov             x9, #FENC_STRIDE
-    SAD_X_START_8 \x
-.rept \h - 1
-    SAD_X_8 \x
-.endr
-    uaddlv          s0, v16.8h
-    uaddlv          s1, v17.8h
-    uaddlv          s2, v18.8h
-.if \x == 4
-    uaddlv          s3, v19.8h
-.endif
 
+// Make function arguments for x == 3 look like x == 4.
 .if \x == 3
-    stp             s0, s1, x5
-    str             s2, x5, #8
-.elseif \x == 4
-    stp             s0, s1, x6
-    stp             s2, s3, x6, #8
+    mov             x6, x5
+    mov             x5, x4
 .endif
-    ret
+    SAD_X_START_\w \x
+    mov             w12, #\h/4
+.loop_sad_x\x\()_\w\()x\h:
+    sub             w12, w12, #1
+ .rept 4
+  .if \w == 24
+    ld1             {v6.16b}, x0, #16
+    ld1             {v7.8b}, x0, x9
+  .elseif \w == 32
+    ld1             {v6.16b-v7.16b}, x0, x9
+  .elseif \w == 48
+    ld1             {v4.16b-v6.16b}, x0, x9
+  .elseif \w == 64
+    ld1             {v4.16b-v7.16b}, x0, x9
+  .endif
+    SAD_X_\w x1, v16, v20
+    SAD_X_\w x2, v17, v21
+    SAD_X_\w x3, v18, v22
+  .if \x == 4
+    SAD_X_\w x4, v19, v23
+  .endif
+ .endr
+    cbnz            w12, .loop_sad_x\x\()_\w\()x\h
+    SAD_X_END_\w \x
 endfunc
 .endm
 
-SAD_X_8xN 3 4
-SAD_X_8xN 3 8
-SAD_X_8xN 3 16
-SAD_X_8xN 3 32
 
​

x265_3.6.tar.gz/source/common/aarch64/ssd-a-common.S Added

@@ -0,0 +1,37 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// This file contains the macros written using NEON instruction set
+// that are also used by the SVE2 functions
+
+#include "asm.S"
+
+.arch           armv8-a
+
+.macro ret_v0_w0                                // add the four 32-bit lanes of v0, return the sum in w0
+    trn2            v1.2d, v0.2d, v0.2d         // v1 = upper 64 bits of v0 in both halves (v1 is clobbered)
+    add             v0.2s, v0.2s, v1.2s         // lanes 0,1 += lanes 2,3
+    addp            v0.2s, v0.2s, v0.2s         // lane 0 = lane 0 + lane 1
+    fmov            w0, s0
+    ret
+.endm

 
@@ -0,0 +1,37 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// This file contains the macros written using NEON instruction set
+// that are also used by the SVE2 functions
+
+#include "asm.S"
+
+.arch           armv8-a
+
+.macro ret_v0_w0                                // add the four 32-bit lanes of v0, return the sum in w0
+    trn2            v1.2d, v0.2d, v0.2d         // v1 = upper 64 bits of v0 in both halves (v1 is clobbered)
+    add             v0.2s, v0.2s, v1.2s         // lanes 0,1 += lanes 2,3
+    addp            v0.2s, v0.2s, v0.2s         // lane 0 = lane 0 + lane 1
+    fmov            w0, s0
+    ret
+.endm
​

x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve.S Added

@@ -0,0 +1,78 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+
+.arch armv8-a+sve
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// Sum of squared differences over a 4x4 block.
+// x0/x2: the two pixel rows being compared, x1/x3: their strides.
+// Each row is loaded as 4 bytes zero-extended to 32-bit lanes; the first row
+// initialises the accumulator (mul), the remaining 3 rows accumulate (mla).
+// The sum is returned in w0.
+function PFX(pixel_sse_pp_4x4_sve)
+    ptrue           p0.s, vl4
+    ld1b            {z0.s}, p0/z, [x0]
+    ld1b            {z17.s}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             z0.s, p0/m, z0.s, z17.s
+    mul             z0.s, p0/m, z0.s, z0.s
+.rept 3
+    ld1b            {z16.s}, p0/z, [x0]
+    ld1b            {z17.s}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             z16.s, p0/m, z16.s, z17.s
+    mla             z0.s, p0/m, z16.s, z16.s
+.endr
+    uaddv           d0, p0, z0.s
+    fmov            w0, s0
+    ret
+endfunc
+
+// Sum of squared differences over a 4x8 block.
+// x0/x2: the two pixel rows being compared, x1/x3: their strides.
+// Each row is loaded as 4 bytes zero-extended to 32-bit lanes; the first row
+// initialises the accumulator (mul), the remaining 7 rows accumulate (mla).
+// The sum is returned in w0.
+function PFX(pixel_sse_pp_4x8_sve)
+    ptrue           p0.s, vl4
+    ld1b            {z0.s}, p0/z, [x0]
+    ld1b            {z17.s}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             z0.s, p0/m, z0.s, z17.s
+    mul             z0.s, p0/m, z0.s, z0.s
+.rept 7
+    ld1b            {z16.s}, p0/z, [x0]
+    ld1b            {z17.s}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             z16.s, p0/m, z16.s, z17.s
+    mla             z0.s, p0/m, z16.s, z16.s
+.endr
+    uaddv           d0, p0, z0.s
+    fmov            w0, s0
+    ret
+endfunc

 
@@ -0,0 +1,78 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+
+.arch armv8-a+sve
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// Sum of squared differences over a 4x4 block.
+// x0/x2: the two pixel rows being compared, x1/x3: their strides.
+// Each row is loaded as 4 bytes zero-extended to 32-bit lanes; the first row
+// initialises the accumulator (mul), the remaining 3 rows accumulate (mla).
+// The sum is returned in w0.
+function PFX(pixel_sse_pp_4x4_sve)
+    ptrue           p0.s, vl4
+    ld1b            {z0.s}, p0/z, [x0]
+    ld1b            {z17.s}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             z0.s, p0/m, z0.s, z17.s
+    mul             z0.s, p0/m, z0.s, z0.s
+.rept 3
+    ld1b            {z16.s}, p0/z, [x0]
+    ld1b            {z17.s}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             z16.s, p0/m, z16.s, z17.s
+    mla             z0.s, p0/m, z16.s, z16.s
+.endr
+    uaddv           d0, p0, z0.s
+    fmov            w0, s0
+    ret
+endfunc
+
+// Sum of squared differences over a 4x8 block.
+// x0/x2: the two pixel rows being compared, x1/x3: their strides.
+// Each row is loaded as 4 bytes zero-extended to 32-bit lanes; the first row
+// initialises the accumulator (mul), the remaining 7 rows accumulate (mla).
+// The sum is returned in w0.
+function PFX(pixel_sse_pp_4x8_sve)
+    ptrue           p0.s, vl4
+    ld1b            {z0.s}, p0/z, [x0]
+    ld1b            {z17.s}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             z0.s, p0/m, z0.s, z17.s
+    mul             z0.s, p0/m, z0.s, z0.s
+.rept 7
+    ld1b            {z16.s}, p0/z, [x0]
+    ld1b            {z17.s}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             z16.s, p0/m, z16.s, z17.s
+    mla             z0.s, p0/m, z16.s, z16.s
+.endr
+    uaddv           d0, p0, z0.s
+    fmov            w0, s0
+    ret
+endfunc
​

x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve2.S Added

@@ -0,0 +1,887 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+#include "ssd-a-common.S"
+
+.arch armv8-a+sve2
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// Sum of squared differences over a 32x32 block.
+// x0/x2: the two pixel rows being compared, x1/x3: their strides.
+// rdvl reads the vector length in bytes: at 16 bytes the NEON loop is used
+// (8 iterations x 4 rows), above that an unrolled SVE2 path handles one row
+// per 32-byte load. The sum is returned in w0.
+function PFX(pixel_sse_pp_32x32_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_sse_pp_32x32
+    mov             w12, #8
+    movi            v0.16b, #0
+    movi            v1.16b, #0
+.loop_sse_pp_32_sve2:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v16.16b,v17.16b}, [x0], x1
+    ld1             {v18.16b,v19.16b}, [x2], x3
+    usubl           v2.8h, v16.8b, v18.8b
+    usubl2          v3.8h, v16.16b, v18.16b
+    usubl           v4.8h, v17.8b, v19.8b
+    usubl2          v5.8h, v17.16b, v19.16b
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v1.4s, v2.8h, v2.8h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal2          v1.4s, v3.8h, v3.8h
+    smlal           v0.4s, v4.4h, v4.4h
+    smlal2          v1.4s, v4.8h, v4.8h
+    smlal           v0.4s, v5.4h, v5.4h
+    smlal2          v1.4s, v5.8h, v5.8h
+.endr
+    cbnz            w12, .loop_sse_pp_32_sve2
+    add             v0.4s, v0.4s, v1.4s
+    ret_v0_w0
+.vl_gt_16_pixel_sse_pp_32x32:
+    ptrue           p0.b, vl32
+    // First row: smullb initialises the accumulator z0.
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z18.b}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    usublb          z1.h, z16.b, z18.b
+    usublt          z2.h, z16.b, z18.b
+    smullb          z0.s, z1.h, z1.h
+    smlalt          z0.s, z1.h, z1.h
+    smlalb          z0.s, z2.h, z2.h
+    smlalt          z0.s, z2.h, z2.h
+.rept 31
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z18.b}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    usublb          z1.h, z16.b, z18.b
+    usublt          z2.h, z16.b, z18.b
+    // Remaining rows must accumulate (smlalb); smullb here would overwrite
+    // z0 and discard the sum of all previous rows.
+    smlalb          z0.s, z1.h, z1.h
+    smlalt          z0.s, z1.h, z1.h
+    smlalb          z0.s, z2.h, z2.h
+    smlalt          z0.s, z2.h, z2.h
+.endr
+    uaddv           d3, p0, z0.s
+    fmov            w0, s3
+    ret
+endfunc
+
+// Sum of squared differences over a 32x64 block, fully unrolled.
+// x0/x2: the two pixel rows being compared, x1/x3: their strides.
+// rdvl reads the vector length in bytes: at 16 bytes each row takes two
+// 16-byte loads, above that a single 32-byte load. In both paths the first
+// row initialises z20/z21 (smullb/smullt) and the other 63 rows accumulate.
+// w1 is reused as scratch for the final add. The sum is returned in w0.
+function PFX(pixel_sse_pp_32x64_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_sse_pp_32x64
+    ptrue           p0.b, vl16
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z17.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z18.b}, p0/z, [x2]
+    ld1b            {z19.b}, p0/z, [x2, #1, mul vl]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    usublb          z1.h, z16.b, z18.b
+    usublt          z2.h, z16.b, z18.b
+    usublb          z3.h, z17.b, z19.b
+    usublt          z4.h, z17.b, z19.b
+    smullb          z20.s, z1.h, z1.h
+    smullt          z21.s, z1.h, z1.h
+    smlalb          z20.s, z2.h, z2.h
+    smlalt          z21.s, z2.h, z2.h
+    smlalb          z20.s, z3.h, z3.h
+    smlalt          z21.s, z3.h, z3.h
+    smlalb          z20.s, z4.h, z4.h
+    smlalt          z21.s, z4.h, z4.h
+.rept 63
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z17.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z18.b}, p0/z, [x2]
+    ld1b            {z19.b}, p0/z, [x2, #1, mul vl]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    usublb          z1.h, z16.b, z18.b
+    usublt          z2.h, z16.b, z18.b
+    usublb          z3.h, z17.b, z19.b
+    usublt          z4.h, z17.b, z19.b
+    smlalb          z20.s, z1.h, z1.h
+    smlalt          z21.s, z1.h, z1.h
+    smlalb          z20.s, z2.h, z2.h
+    smlalt          z21.s, z2.h, z2.h
+    smlalb          z20.s, z3.h, z3.h
+    smlalt          z21.s, z3.h, z3.h
+    smlalb          z20.s, z4.h, z4.h
+    smlalt          z21.s, z4.h, z4.h
+.endr
+    uaddv           d3, p0, z20.s
+    fmov            w0, s3
+    uaddv           d4, p0, z21.s
+    fmov            w1, s4
+    add             w0, w0, w1
+    ret
+.vl_gt_16_pixel_sse_pp_32x64:
+    ptrue           p0.b, vl32
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z18.b}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    usublb          z1.h, z16.b, z18.b
+    usublt          z2.h, z16.b, z18.b
+    smullb          z20.s, z1.h, z1.h
+    smullt          z21.s, z1.h, z1.h
+    smlalb          z20.s, z2.h, z2.h
+    smlalt          z21.s, z2.h, z2.h
+.rept 63
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z18.b}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    usublb          z1.h, z16.b, z18.b
+    usublt          z2.h, z16.b, z18.b
+    smlalb          z20.s, z1.h, z1.h
+    smlalt          z21.s, z1.h, z1.h
+    smlalb          z20.s, z2.h, z2.h
+    smlalt          z21.s, z2.h, z2.h
+.endr
+    uaddv           d3, p0, z20.s
+    fmov            w0, s3
+    uaddv           d4, p0, z21.s
+    fmov            w1, s4
+    add             w0, w0, w1
+    ret
+endfunc
+
+function PFX(pixel_sse_pp_64x64_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_sse_pp_64x64
+    mov             w12, #16
+    movi            v0.16b, #0
+    movi            v1.16b, #0
+
+.loop_sse_pp_64_sve2:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v16.16b-v19.16b}, x0, x1
+    ld1             {v20.16b-v23.16b}, x2, x3
+
+    usubl           v2.8h, v16.8b, v20.8b
+    usubl2          v3.8h, v16.16b, v20.16b
+    usubl           v4.8h, v17.8b, v21.8b
+    usubl2          v5.8h, v17.16b, v21.16b
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v1.4s, v2.8h, v2.8h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal2          v1.4s, v3.8h, v3.8h
+    smlal           v0.4s, v4.4h, v4.4h

 
@@ -0,0 +1,887 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+#include "ssd-a-common.S"
+
+.arch armv8-a+sve2
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// pixel_sse_pp_32x32_sve2
+// Sum of squared differences between two 32x32 blocks of bytes.
+//   x0 / x1 : first block pointer / per-row increment
+//   x2 / x3 : second block pointer / per-row increment
+// The SVE path returns the sum in w0. The NEON path finishes through
+// ret_v0_w0, which is defined outside this file (presumably it reduces v0
+// into w0 and returns -- confirm in ssd-a-common.S).
+function PFX(pixel_sse_pp_32x32_sve2)
+    rdvl            x9, #1                      // x9 = SVE vector length in bytes
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_sse_pp_32x32
+    // 16-byte vectors: plain NEON loop, 8 iterations x 4 unrolled rows.
+    mov             w12, #8
+    movi            v0.16b, #0
+    movi            v1.16b, #0
+.loop_sse_pp_32_sve2:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v16.16b,v17.16b}, [x0], x1
+    ld1             {v18.16b,v19.16b}, [x2], x3
+    usubl           v2.8h, v16.8b, v18.8b
+    usubl2          v3.8h, v16.16b, v18.16b
+    usubl           v4.8h, v17.8b, v19.8b
+    usubl2          v5.8h, v17.16b, v19.16b
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v1.4s, v2.8h, v2.8h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal2          v1.4s, v3.8h, v3.8h
+    smlal           v0.4s, v4.4h, v4.4h
+    smlal2          v1.4s, v4.8h, v4.8h
+    smlal           v0.4s, v5.4h, v5.4h
+    smlal2          v1.4s, v5.8h, v5.8h
+.endr
+    cbnz            w12, .loop_sse_pp_32_sve2
+    add             v0.4s, v0.4s, v1.4s
+    ret_v0_w0
+.vl_gt_16_pixel_sse_pp_32x32:
+    // Longer vectors: one predicated 32-byte load covers a whole row.
+    ptrue           p0.b, vl32
+    // First row: smullb initialises the accumulator z0.
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z18.b}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    usublb          z1.h, z16.b, z18.b
+    usublt          z2.h, z16.b, z18.b
+    smullb          z0.s, z1.h, z1.h
+    smlalt          z0.s, z1.h, z1.h
+    smlalb          z0.s, z2.h, z2.h
+    smlalt          z0.s, z2.h, z2.h
+    // Remaining 31 rows must only accumulate: a plain multiply here would
+    // overwrite z0 and discard the sum of every previous row.
+.rept 31
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z18.b}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    usublb          z1.h, z16.b, z18.b
+    usublt          z2.h, z16.b, z18.b
+    smlalb          z0.s, z1.h, z1.h
+    smlalt          z0.s, z1.h, z1.h
+    smlalb          z0.s, z2.h, z2.h
+    smlalt          z0.s, z2.h, z2.h
+.endr
+    uaddv           d3, p0, z0.s
+    fmov            w0, s3
+    ret
+endfunc
+
+// pixel_sse_pp_32x64_sve2
+// Sum of squared differences between two blocks of 64 rows x 32 bytes.
+//   x0 / x1 : first block pointer / per-row increment
+//   x2 / x3 : second block pointer / per-row increment
+// Result is returned in w0. Even and odd halfword products are kept in two
+// accumulators (z20, z21) that are reduced separately and added at the end.
+// Both paths handle the first row with smullb/smullt to initialise the
+// accumulators, then accumulate the remaining 63 rows.
+function PFX(pixel_sse_pp_32x64_sve2)
+    rdvl            x9, #1                      // x9 = SVE vector length in bytes
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_sse_pp_32x64
+    // 16-byte vectors: each row needs two loads, the second one vector
+    // length further on ("#1, mul vl").
+    ptrue           p0.b, vl16
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z17.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z18.b}, p0/z, [x2]
+    ld1b            {z19.b}, p0/z, [x2, #1, mul vl]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    usublb          z1.h, z16.b, z18.b
+    usublt          z2.h, z16.b, z18.b
+    usublb          z3.h, z17.b, z19.b
+    usublt          z4.h, z17.b, z19.b
+    smullb          z20.s, z1.h, z1.h
+    smullt          z21.s, z1.h, z1.h
+    smlalb          z20.s, z2.h, z2.h
+    smlalt          z21.s, z2.h, z2.h
+    smlalb          z20.s, z3.h, z3.h
+    smlalt          z21.s, z3.h, z3.h
+    smlalb          z20.s, z4.h, z4.h
+    smlalt          z21.s, z4.h, z4.h
+.rept 63
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z17.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z18.b}, p0/z, [x2]
+    ld1b            {z19.b}, p0/z, [x2, #1, mul vl]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    usublb          z1.h, z16.b, z18.b
+    usublt          z2.h, z16.b, z18.b
+    usublb          z3.h, z17.b, z19.b
+    usublt          z4.h, z17.b, z19.b
+    smlalb          z20.s, z1.h, z1.h
+    smlalt          z21.s, z1.h, z1.h
+    smlalb          z20.s, z2.h, z2.h
+    smlalt          z21.s, z2.h, z2.h
+    smlalb          z20.s, z3.h, z3.h
+    smlalt          z21.s, z3.h, z3.h
+    smlalb          z20.s, z4.h, z4.h
+    smlalt          z21.s, z4.h, z4.h
+.endr
+    uaddv           d3, p0, z20.s
+    fmov            w0, s3
+    uaddv           d4, p0, z21.s
+    fmov            w1, s4
+    add             w0, w0, w1
+    ret
+.vl_gt_16_pixel_sse_pp_32x64:
+    // Longer vectors: one predicated 32-byte load covers a whole row.
+    ptrue           p0.b, vl32
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z18.b}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    usublb          z1.h, z16.b, z18.b
+    usublt          z2.h, z16.b, z18.b
+    smullb          z20.s, z1.h, z1.h
+    smullt          z21.s, z1.h, z1.h
+    smlalb          z20.s, z2.h, z2.h
+    smlalt          z21.s, z2.h, z2.h
+.rept 63
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z18.b}, p0/z, [x2]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    usublb          z1.h, z16.b, z18.b
+    usublt          z2.h, z16.b, z18.b
+    smlalb          z20.s, z1.h, z1.h
+    smlalt          z21.s, z1.h, z1.h
+    smlalb          z20.s, z2.h, z2.h
+    smlalt          z21.s, z2.h, z2.h
+.endr
+    uaddv           d3, p0, z20.s
+    fmov            w0, s3
+    uaddv           d4, p0, z21.s
+    fmov            w1, s4
+    add             w0, w0, w1
+    ret
+endfunc
+
+function PFX(pixel_sse_pp_64x64_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_sse_pp_64x64
+    mov             w12, #16
+    movi            v0.16b, #0
+    movi            v1.16b, #0
+
+.loop_sse_pp_64_sve2:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v16.16b-v19.16b}, x0, x1
+    ld1             {v20.16b-v23.16b}, x2, x3
+
+    usubl           v2.8h, v16.8b, v20.8b
+    usubl2          v3.8h, v16.16b, v20.16b
+    usubl           v4.8h, v17.8b, v21.8b
+    usubl2          v5.8h, v17.16b, v21.16b
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v1.4s, v2.8h, v2.8h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal2          v1.4s, v3.8h, v3.8h
+    smlal           v0.4s, v4.4h, v4.4h
​

x265_3.6.tar.gz/source/common/aarch64/ssd-a.S Added

@@ -0,0 +1,476 @@
+/*****************************************************************************
+ * Copyright (C) 2021 MulticoreWare, Inc
+ *
+ * Authors: Sebastian Pop <spop@amazon.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "ssd-a-common.S"
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// pixel_sse_pp_4x4_neon
+// Sum of squared differences over four rows of four bytes.
+//   x0 / x1 : first block pointer / per-row post-increment
+//   x2 / x3 : second block pointer / per-row post-increment
+// Each row is loaded into 32-bit lane 0 of its own register. Only the low
+// four halfwords (.4h) of each difference are squared, so the bytes above
+// lane 0 that were never loaded do not contribute.
+// ret_v0_w0 is defined outside this file (presumably it reduces v0 into w0
+// and returns -- confirm in ssd-a-common.S).
+function PFX(pixel_sse_pp_4x4_neon)
+    ld1             {v16.s}[0], [x0], x1
+    ld1             {v17.s}[0], [x2], x3
+    ld1             {v18.s}[0], [x0], x1
+    ld1             {v19.s}[0], [x2], x3
+    ld1             {v20.s}[0], [x0], x1
+    ld1             {v21.s}[0], [x2], x3
+    ld1             {v22.s}[0], [x0], x1
+    ld1             {v23.s}[0], [x2], x3
+
+    usubl           v1.8h, v16.8b, v17.8b
+    usubl           v2.8h, v18.8b, v19.8b
+    usubl           v3.8h, v20.8b, v21.8b
+    usubl           v4.8h, v22.8b, v23.8b
+
+    smull           v0.4s, v1.4h, v1.4h
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal           v0.4s, v4.4h, v4.4h
+    ret_v0_w0
+endfunc
+
+// pixel_sse_pp_4x8_neon
+// Sum of squared differences over eight rows of four bytes.
+//   x0 / x1 : first block pointer / per-row post-increment
+//   x2 / x3 : second block pointer / per-row post-increment
+// Software-pipelined: the loads for the next row are interleaved with the
+// arithmetic for the current one (1 prologue row + 6 repeated + 1 epilogue).
+// Only the low four halfwords of each difference are squared.
+function PFX(pixel_sse_pp_4x8_neon)
+    ld1             {v16.s}[0], [x0], x1
+    ld1             {v17.s}[0], [x2], x3
+    usubl           v1.8h, v16.8b, v17.8b
+    ld1             {v16.s}[0], [x0], x1
+    ld1             {v17.s}[0], [x2], x3
+    smull           v0.4s, v1.4h, v1.4h
+.rept 6
+    usubl           v1.8h, v16.8b, v17.8b
+    ld1             {v16.s}[0], [x0], x1
+    smlal           v0.4s, v1.4h, v1.4h
+    ld1             {v17.s}[0], [x2], x3
+.endr
+    usubl           v1.8h, v16.8b, v17.8b
+    smlal           v0.4s, v1.4h, v1.4h
+    ret_v0_w0
+endfunc
+
+// pixel_sse_pp_8x8_neon
+// Sum of squared differences over eight rows of eight bytes.
+//   x0 / x1 : first block pointer / per-row post-increment
+//   x2 / x3 : second block pointer / per-row post-increment
+// Software-pipelined: 1 prologue row + 6 repeated rows + 1 epilogue row.
+// smull initialises the accumulator v0 on the first row; every later
+// product is accumulated with smlal/smlal2.
+function PFX(pixel_sse_pp_8x8_neon)
+    ld1             {v16.8b}, [x0], x1
+    ld1             {v17.8b}, [x2], x3
+    usubl           v1.8h, v16.8b, v17.8b
+    ld1             {v16.8b}, [x0], x1
+    smull           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ld1             {v17.8b}, [x2], x3
+
+.rept 6
+    usubl           v1.8h, v16.8b, v17.8b
+    ld1             {v16.8b}, [x0], x1
+    smlal           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ld1             {v17.8b}, [x2], x3
+.endr
+    usubl           v1.8h, v16.8b, v17.8b
+    smlal           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ret_v0_w0
+endfunc
+
+// pixel_sse_pp_8x16_neon
+// Sum of squared differences over sixteen rows of eight bytes.
+//   x0 / x1 : first block pointer / per-row post-increment
+//   x2 / x3 : second block pointer / per-row post-increment
+// Software-pipelined: 1 prologue row + 14 repeated rows + 1 epilogue row.
+function PFX(pixel_sse_pp_8x16_neon)
+    ld1             {v16.8b}, [x0], x1
+    ld1             {v17.8b}, [x2], x3
+    usubl           v1.8h, v16.8b, v17.8b
+    ld1             {v16.8b}, [x0], x1
+    smull           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ld1             {v17.8b}, [x2], x3
+
+.rept 14
+    usubl           v1.8h, v16.8b, v17.8b
+    ld1             {v16.8b}, [x0], x1
+    smlal           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ld1             {v17.8b}, [x2], x3
+.endr
+    usubl           v1.8h, v16.8b, v17.8b
+    smlal           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ret_v0_w0
+endfunc
+
+// sse_pp_16xN: emits pixel_sse_pp_16x<h>_neon, the sum of squared
+// differences over <h> rows of sixteen bytes.
+//   x0 / x1 : first block pointer / per-row post-increment
+//   x2 / x3 : second block pointer / per-row post-increment
+// Software-pipelined: 1 prologue row + (h - 2) repeated rows + 1 epilogue
+// row, so h must be at least 2.
+.macro sse_pp_16xN h
+function PFX(pixel_sse_pp_16x\h\()_neon)
+    ld1             {v16.16b}, [x0], x1
+    ld1             {v17.16b}, [x2], x3
+    usubl           v1.8h, v16.8b, v17.8b
+    usubl2          v2.8h, v16.16b, v17.16b
+    ld1             {v16.16b}, [x0], x1
+    ld1             {v17.16b}, [x2], x3
+    smull           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v0.4s, v2.8h, v2.8h
+.rept \h - 2
+    usubl           v1.8h, v16.8b, v17.8b
+    usubl2          v2.8h, v16.16b, v17.16b
+    ld1             {v16.16b}, [x0], x1
+    smlal           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ld1             {v17.16b}, [x2], x3
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v0.4s, v2.8h, v2.8h
+.endr
+    usubl           v1.8h, v16.8b, v17.8b
+    usubl2          v2.8h, v16.16b, v17.16b
+    smlal           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v0.4s, v2.8h, v2.8h
+    ret_v0_w0
+endfunc
+.endm
+
+sse_pp_16xN 16
+sse_pp_16xN 32
+
+// pixel_sse_pp_32x32_neon
+// Sum of squared differences over 32 rows of 32 bytes.
+//   x0 / x1 : first block pointer / per-row post-increment
+//   x2 / x3 : second block pointer / per-row post-increment
+// Low-half products accumulate in v0 and high-half products in v1; the two
+// are added before ret_v0_w0 (defined outside this file).
+function PFX(pixel_sse_pp_32x32_neon)
+    mov             w12, #8                     // 8 iterations x 4 unrolled rows
+    movi            v0.16b, #0
+    movi            v1.16b, #0
+.loop_sse_pp_32:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v16.16b,v17.16b}, [x0], x1
+    ld1             {v18.16b,v19.16b}, [x2], x3
+    usubl           v2.8h, v16.8b, v18.8b
+    usubl2          v3.8h, v16.16b, v18.16b
+    usubl           v4.8h, v17.8b, v19.8b
+    usubl2          v5.8h, v17.16b, v19.16b
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v1.4s, v2.8h, v2.8h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal2          v1.4s, v3.8h, v3.8h
+    smlal           v0.4s, v4.4h, v4.4h
+    smlal2          v1.4s, v4.8h, v4.8h
+    smlal           v0.4s, v5.4h, v5.4h
+    smlal2          v1.4s, v5.8h, v5.8h
+.endr
+    cbnz            w12, .loop_sse_pp_32
+    add             v0.4s, v0.4s, v1.4s
+    ret_v0_w0
+endfunc
+
+function PFX(pixel_sse_pp_32x64_neon)
+    mov             w12, #16
+    movi            v0.16b, #0
+    movi            v1.16b, #0
+.loop_sse_pp_32x64:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v16.16b,v17.16b}, x0, x1
+    ld1             {v18.16b,v19.16b}, x2, x3
+    usubl           v2.8h, v16.8b, v18.8b
+    usubl2          v3.8h, v16.16b, v18.16b
+    usubl           v4.8h, v17.8b, v19.8b
+    usubl2          v5.8h, v17.16b, v19.16b
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v1.4s, v2.8h, v2.8h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal2          v1.4s, v3.8h, v3.8h

 
@@ -0,0 +1,476 @@
+/*****************************************************************************
+ * Copyright (C) 2021 MulticoreWare, Inc
+ *
+ * Authors: Sebastian Pop <spop@amazon.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "ssd-a-common.S"
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// pixel_sse_pp_4x4_neon
+// Sum of squared differences over four rows of four bytes.
+//   x0 / x1 : first block pointer / per-row post-increment
+//   x2 / x3 : second block pointer / per-row post-increment
+// Each row is loaded into 32-bit lane 0 of its own register. Only the low
+// four halfwords (.4h) of each difference are squared, so the bytes above
+// lane 0 that were never loaded do not contribute.
+// ret_v0_w0 is defined outside this file (presumably it reduces v0 into w0
+// and returns -- confirm in ssd-a-common.S).
+function PFX(pixel_sse_pp_4x4_neon)
+    ld1             {v16.s}[0], [x0], x1
+    ld1             {v17.s}[0], [x2], x3
+    ld1             {v18.s}[0], [x0], x1
+    ld1             {v19.s}[0], [x2], x3
+    ld1             {v20.s}[0], [x0], x1
+    ld1             {v21.s}[0], [x2], x3
+    ld1             {v22.s}[0], [x0], x1
+    ld1             {v23.s}[0], [x2], x3
+
+    usubl           v1.8h, v16.8b, v17.8b
+    usubl           v2.8h, v18.8b, v19.8b
+    usubl           v3.8h, v20.8b, v21.8b
+    usubl           v4.8h, v22.8b, v23.8b
+
+    smull           v0.4s, v1.4h, v1.4h
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal           v0.4s, v4.4h, v4.4h
+    ret_v0_w0
+endfunc
+
+// pixel_sse_pp_4x8_neon
+// Sum of squared differences over eight rows of four bytes.
+//   x0 / x1 : first block pointer / per-row post-increment
+//   x2 / x3 : second block pointer / per-row post-increment
+// Software-pipelined: the loads for the next row are interleaved with the
+// arithmetic for the current one (1 prologue row + 6 repeated + 1 epilogue).
+// Only the low four halfwords of each difference are squared.
+function PFX(pixel_sse_pp_4x8_neon)
+    ld1             {v16.s}[0], [x0], x1
+    ld1             {v17.s}[0], [x2], x3
+    usubl           v1.8h, v16.8b, v17.8b
+    ld1             {v16.s}[0], [x0], x1
+    ld1             {v17.s}[0], [x2], x3
+    smull           v0.4s, v1.4h, v1.4h
+.rept 6
+    usubl           v1.8h, v16.8b, v17.8b
+    ld1             {v16.s}[0], [x0], x1
+    smlal           v0.4s, v1.4h, v1.4h
+    ld1             {v17.s}[0], [x2], x3
+.endr
+    usubl           v1.8h, v16.8b, v17.8b
+    smlal           v0.4s, v1.4h, v1.4h
+    ret_v0_w0
+endfunc
+
+// pixel_sse_pp_8x8_neon
+// Sum of squared differences over eight rows of eight bytes.
+//   x0 / x1 : first block pointer / per-row post-increment
+//   x2 / x3 : second block pointer / per-row post-increment
+// Software-pipelined: 1 prologue row + 6 repeated rows + 1 epilogue row.
+// smull initialises the accumulator v0 on the first row; every later
+// product is accumulated with smlal/smlal2.
+function PFX(pixel_sse_pp_8x8_neon)
+    ld1             {v16.8b}, [x0], x1
+    ld1             {v17.8b}, [x2], x3
+    usubl           v1.8h, v16.8b, v17.8b
+    ld1             {v16.8b}, [x0], x1
+    smull           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ld1             {v17.8b}, [x2], x3
+
+.rept 6
+    usubl           v1.8h, v16.8b, v17.8b
+    ld1             {v16.8b}, [x0], x1
+    smlal           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ld1             {v17.8b}, [x2], x3
+.endr
+    usubl           v1.8h, v16.8b, v17.8b
+    smlal           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ret_v0_w0
+endfunc
+
+// pixel_sse_pp_8x16_neon
+// Sum of squared differences over sixteen rows of eight bytes.
+//   x0 / x1 : first block pointer / per-row post-increment
+//   x2 / x3 : second block pointer / per-row post-increment
+// Software-pipelined: 1 prologue row + 14 repeated rows + 1 epilogue row.
+function PFX(pixel_sse_pp_8x16_neon)
+    ld1             {v16.8b}, [x0], x1
+    ld1             {v17.8b}, [x2], x3
+    usubl           v1.8h, v16.8b, v17.8b
+    ld1             {v16.8b}, [x0], x1
+    smull           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ld1             {v17.8b}, [x2], x3
+
+.rept 14
+    usubl           v1.8h, v16.8b, v17.8b
+    ld1             {v16.8b}, [x0], x1
+    smlal           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ld1             {v17.8b}, [x2], x3
+.endr
+    usubl           v1.8h, v16.8b, v17.8b
+    smlal           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ret_v0_w0
+endfunc
+
+// sse_pp_16xN: emits pixel_sse_pp_16x<h>_neon, the sum of squared
+// differences over <h> rows of sixteen bytes.
+//   x0 / x1 : first block pointer / per-row post-increment
+//   x2 / x3 : second block pointer / per-row post-increment
+// Software-pipelined: 1 prologue row + (h - 2) repeated rows + 1 epilogue
+// row, so h must be at least 2.
+.macro sse_pp_16xN h
+function PFX(pixel_sse_pp_16x\h\()_neon)
+    ld1             {v16.16b}, [x0], x1
+    ld1             {v17.16b}, [x2], x3
+    usubl           v1.8h, v16.8b, v17.8b
+    usubl2          v2.8h, v16.16b, v17.16b
+    ld1             {v16.16b}, [x0], x1
+    ld1             {v17.16b}, [x2], x3
+    smull           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v0.4s, v2.8h, v2.8h
+.rept \h - 2
+    usubl           v1.8h, v16.8b, v17.8b
+    usubl2          v2.8h, v16.16b, v17.16b
+    ld1             {v16.16b}, [x0], x1
+    smlal           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ld1             {v17.16b}, [x2], x3
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v0.4s, v2.8h, v2.8h
+.endr
+    usubl           v1.8h, v16.8b, v17.8b
+    usubl2          v2.8h, v16.16b, v17.16b
+    smlal           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v0.4s, v2.8h, v2.8h
+    ret_v0_w0
+endfunc
+.endm
+
+sse_pp_16xN 16
+sse_pp_16xN 32
+
+// pixel_sse_pp_32x32_neon
+// Sum of squared differences over 32 rows of 32 bytes.
+//   x0 / x1 : first block pointer / per-row post-increment
+//   x2 / x3 : second block pointer / per-row post-increment
+// Low-half products accumulate in v0 and high-half products in v1; the two
+// are added before ret_v0_w0 (defined outside this file).
+function PFX(pixel_sse_pp_32x32_neon)
+    mov             w12, #8                     // 8 iterations x 4 unrolled rows
+    movi            v0.16b, #0
+    movi            v1.16b, #0
+.loop_sse_pp_32:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v16.16b,v17.16b}, [x0], x1
+    ld1             {v18.16b,v19.16b}, [x2], x3
+    usubl           v2.8h, v16.8b, v18.8b
+    usubl2          v3.8h, v16.16b, v18.16b
+    usubl           v4.8h, v17.8b, v19.8b
+    usubl2          v5.8h, v17.16b, v19.16b
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v1.4s, v2.8h, v2.8h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal2          v1.4s, v3.8h, v3.8h
+    smlal           v0.4s, v4.4h, v4.4h
+    smlal2          v1.4s, v4.8h, v4.8h
+    smlal           v0.4s, v5.4h, v5.4h
+    smlal2          v1.4s, v5.8h, v5.8h
+.endr
+    cbnz            w12, .loop_sse_pp_32
+    add             v0.4s, v0.4s, v1.4s
+    ret_v0_w0
+endfunc
+
+function PFX(pixel_sse_pp_32x64_neon)
+    mov             w12, #16
+    movi            v0.16b, #0
+    movi            v1.16b, #0
+.loop_sse_pp_32x64:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v16.16b,v17.16b}, x0, x1
+    ld1             {v18.16b,v19.16b}, x2, x3
+    usubl           v2.8h, v16.8b, v18.8b
+    usubl2          v3.8h, v16.16b, v18.16b
+    usubl           v4.8h, v17.8b, v19.8b
+    usubl2          v5.8h, v17.16b, v19.16b
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v1.4s, v2.8h, v2.8h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal2          v1.4s, v3.8h, v3.8h
​

x265_3.5.tar.gz/source/common/common.h -> x265_3.6.tar.gz/source/common/common.h Changed

@@ -130,7 +130,6 @@
 typedef uint64_t pixel4;
 typedef int64_t  ssum2_t;
 #define SHIFT_TO_BITPLANE 9
-#define HISTOGRAM_BINS 1024
 #else
 typedef uint8_t  pixel;
 typedef uint16_t sum_t;
@@ -138,7 +137,6 @@
 typedef uint32_t pixel4;
 typedef int32_t  ssum2_t; // Signed sum
 #define SHIFT_TO_BITPLANE 7
-#define HISTOGRAM_BINS 256
 #endif // if HIGH_BIT_DEPTH
 
 #if X265_DEPTH < 10
@@ -162,6 +160,8 @@
 
 #define MIN_QPSCALE     0.21249999999999999
 #define MAX_MAX_QPSCALE 615.46574234477100
+#define FRAME_BRIGHTNESS_THRESHOLD  50.0 // Min % of pixels in a frame, that are above BRIGHTNESS_THRESHOLD for it to be considered a bright frame
+#define FRAME_EDGE_THRESHOLD  10.0 // Min % of edge pixels in a frame, for it to be considered to have high edge density
 
 
 template<typename T>
@@ -340,6 +340,9 @@
 #define FILLER_OVERHEAD (NAL_TYPE_OVERHEAD + START_CODE_OVERHEAD + 1)
 
 #define MAX_NUM_DYN_REFINE          (NUM_CU_DEPTH * X265_REFINE_INTER_LEVELS)
+#define X265_BYTE 8
+
+#define MAX_MCSTF_TEMPORAL_WINDOW_LENGTH 8
 
 namespace X265_NS {
 
@@ -434,6 +437,14 @@
 #define  x265_unlink(fileName) unlink(fileName)
 #define  x265_rename(oldName, newName) rename(oldName, newName)
 #endif
+/* Close a file */
+#define  x265_fclose(file) if (file != NULL) fclose(file); file=NULL;
+#define x265_fread(val, size, readSize, fileOffset,errorMessage)\
+    if (fread(val, size, readSize, fileOffset) != readSize)\
+    {\
+        x265_log(NULL, X265_LOG_ERROR, errorMessage); \
+        return; \
+    }
 int      x265_exp2fix8(double x);
 
 double   x265_ssim2dB(double ssim);

 
@@ -130,7 +130,6 @@
 typedef uint64_t pixel4;
 typedef int64_t  ssum2_t;
 #define SHIFT_TO_BITPLANE 9
-#define HISTOGRAM_BINS 1024
 #else
 typedef uint8_t  pixel;
 typedef uint16_t sum_t;
@@ -138,7 +137,6 @@
 typedef uint32_t pixel4;
 typedef int32_t  ssum2_t; // Signed sum
 #define SHIFT_TO_BITPLANE 7
-#define HISTOGRAM_BINS 256
 #endif // if HIGH_BIT_DEPTH
 
 #if X265_DEPTH < 10
@@ -162,6 +160,8 @@
 
 #define MIN_QPSCALE     0.21249999999999999
 #define MAX_MAX_QPSCALE 615.46574234477100
+#define FRAME_BRIGHTNESS_THRESHOLD  50.0 // Min % of pixels in a frame, that are above BRIGHTNESS_THRESHOLD for it to be considered a bright frame
+#define FRAME_EDGE_THRESHOLD  10.0 // Min % of edge pixels in a frame, for it to be considered to have high edge density
 
 
 template<typename T>
@@ -340,6 +340,9 @@
 #define FILLER_OVERHEAD (NAL_TYPE_OVERHEAD + START_CODE_OVERHEAD + 1)
 
 #define MAX_NUM_DYN_REFINE          (NUM_CU_DEPTH * X265_REFINE_INTER_LEVELS)
+#define X265_BYTE 8
+
+#define MAX_MCSTF_TEMPORAL_WINDOW_LENGTH 8
 
 namespace X265_NS {
 
@@ -434,6 +437,14 @@
 #define  x265_unlink(fileName) unlink(fileName)
 #define  x265_rename(oldName, newName) rename(oldName, newName)
 #endif
+/* Close a file */
+#define  x265_fclose(file) if (file != NULL) fclose(file); file=NULL;
+#define x265_fread(val, size, readSize, fileOffset,errorMessage)\
+    if (fread(val, size, readSize, fileOffset) != readSize)\
+    {\
+        x265_log(NULL, X265_LOG_ERROR, errorMessage); \
+        return; \
+    }
 int      x265_exp2fix8(double x);
 
 double   x265_ssim2dB(double ssim);
​

x265_3.5.tar.gz/source/common/cpu.cpp -> x265_3.6.tar.gz/source/common/cpu.cpp Changed

@@ -7,6 +7,8 @@
  *          Steve Borho <steve@borho.org>
  *          Hongbin Liu <liuhongbin1@huawei.com>
  *          Yimeng Su <yimeng.su@huawei.com>
+ *          Josh Dekker <josh@itanimul.li>
+ *          Jean-Baptiste Kempf <jb@videolan.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -105,6 +107,14 @@
     { "NEON",            X265_CPU_NEON },
     { "FastNeonMRC",     X265_CPU_FAST_NEON_MRC },
 
+#elif X265_ARCH_ARM64
+    { "NEON",            X265_CPU_NEON },
+#if defined(HAVE_SVE)
+    { "SVE",            X265_CPU_SVE },
+#endif
+#if defined(HAVE_SVE2)
+    { "SVE2",            X265_CPU_SVE2 },
+#endif
 #elif X265_ARCH_POWER8
     { "Altivec",         X265_CPU_ALTIVEC },
 
@@ -369,12 +379,30 @@
     flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
 #endif
     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
-#elif X265_ARCH_ARM64
-    flags |= X265_CPU_NEON;
 #endif // if HAVE_ARMV6
     return flags;
 }
 
+#elif X265_ARCH_ARM64
+
+/* ARM64 variant of cpu_detect().
+ * The returned X265_CPU_* mask is decided entirely at compile time from the
+ * HAVE_SVE2 / HAVE_SVE / HAVE_NEON build macros; nothing is probed on the
+ * running CPU. A higher extension level also reports every lower one.
+ * benableavx512 is accepted for signature compatibility and is not used. */
+uint32_t cpu_detect(bool benableavx512)
+{
+    (void)benableavx512;
+
+    /* Same unsigned type as the return value: this is a bit mask. */
+    uint32_t flags = 0;
+
+#if defined(HAVE_SVE2)
+    flags |= X265_CPU_SVE2 | X265_CPU_SVE | X265_CPU_NEON;
+#elif defined(HAVE_SVE)
+    flags |= X265_CPU_SVE | X265_CPU_NEON;
+#elif HAVE_NEON
+    flags |= X265_CPU_NEON;
+#endif
+
+    return flags;
+}
+
 #elif X265_ARCH_POWER8
 
 uint32_t cpu_detect(bool benableavx512)

 
@@ -7,6 +7,8 @@
  *          Steve Borho <steve@borho.org>
  *          Hongbin Liu <liuhongbin1@huawei.com>
  *          Yimeng Su <yimeng.su@huawei.com>
+ *          Josh Dekker <josh@itanimul.li>
+ *          Jean-Baptiste Kempf <jb@videolan.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -105,6 +107,14 @@
     { "NEON",            X265_CPU_NEON },
     { "FastNeonMRC",     X265_CPU_FAST_NEON_MRC },
 
+#elif X265_ARCH_ARM64
+    { "NEON",            X265_CPU_NEON },
+#if defined(HAVE_SVE)
+    { "SVE",            X265_CPU_SVE },
+#endif
+#if defined(HAVE_SVE2)
+    { "SVE2",            X265_CPU_SVE2 },
+#endif
 #elif X265_ARCH_POWER8
     { "Altivec",         X265_CPU_ALTIVEC },
 
@@ -369,12 +379,30 @@
     flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
 #endif
     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
-#elif X265_ARCH_ARM64
-    flags |= X265_CPU_NEON;
 #endif // if HAVE_ARMV6
     return flags;
 }
 
+#elif X265_ARCH_ARM64
+
+/* ARM64 variant of cpu_detect().
+ * The returned X265_CPU_* mask is decided entirely at compile time from the
+ * HAVE_SVE2 / HAVE_SVE / HAVE_NEON build macros; nothing is probed on the
+ * running CPU. A higher extension level also reports every lower one.
+ * benableavx512 is accepted for signature compatibility and is not used. */
+uint32_t cpu_detect(bool benableavx512)
+{
+    (void)benableavx512;
+
+    /* Same unsigned type as the return value: this is a bit mask. */
+    uint32_t flags = 0;
+
+#if defined(HAVE_SVE2)
+    flags |= X265_CPU_SVE2 | X265_CPU_SVE | X265_CPU_NEON;
+#elif defined(HAVE_SVE)
+    flags |= X265_CPU_SVE | X265_CPU_NEON;
+#elif HAVE_NEON
+    flags |= X265_CPU_NEON;
+#endif
+
+    return flags;
+}
+
 #elif X265_ARCH_POWER8
 
 uint32_t cpu_detect(bool benableavx512)
​

x265_3.5.tar.gz/source/common/frame.cpp -> x265_3.6.tar.gz/source/common/frame.cpp Changed

@@ -64,12 +64,40 @@
     m_edgeBitPlane = NULL;
     m_edgeBitPic = NULL;
     m_isInsideWindow = 0;
+
+    // mcstf
+    m_isSubSampled = NULL;
+    m_mcstf = NULL;
+    m_refPicCnt0 = 0;
+    m_refPicCnt1 = 0;
+    m_nextMCSTF = NULL;
+    m_prevMCSTF = NULL;
+
+    m_tempLayer = 0;
+    m_sameLayerRefPic = false;
 }
 
 bool Frame::create(x265_param *param, float* quantOffsets)
 {
     m_fencPic = new PicYuv;
     m_param = param;
+
+    if (m_param->bEnableTemporalFilter)
+    {
+        m_mcstf = new TemporalFilter;
+        m_mcstf->init(param);
+
+        m_fencPicSubsampled2 = new PicYuv;
+        m_fencPicSubsampled4 = new PicYuv;
+
+        if (!m_fencPicSubsampled2->createScaledPicYUV(param, 2))
+            return false;
+        if (!m_fencPicSubsampled4->createScaledPicYUV(param, 4))
+            return false;
+
+        CHECKED_MALLOC_ZERO(m_isSubSampled, int, 1);
+    }
+
     CHECKED_MALLOC_ZERO(m_rcData, RcStats, 1);
 
     if (param->bCTUInfo)
@@ -151,6 +179,22 @@
     return false;
 }
 
+/* Allocate the 1/2- and 1/4-scale copies of the source picture together
+ * with the m_isSubSampled flag. Returns true when everything was created,
+ * false as soon as one step fails. */
+bool Frame::createSubSample()
+{
+    m_fencPicSubsampled2 = new PicYuv;
+    m_fencPicSubsampled4 = new PicYuv;
+
+    /* && short-circuits, so the 1/4-scale picture is only attempted once
+     * the 1/2-scale one has been created. */
+    const bool scaledPicsCreated = m_fencPicSubsampled2->createScaledPicYUV(m_param, 2)
+                                && m_fencPicSubsampled4->createScaledPicYUV(m_param, 4);
+    if (!scaledPicsCreated)
+        return false;
+
+    /* NOTE(review): CHECKED_MALLOC_ZERO presumably jumps to `fail` when the
+     * allocation fails -- confirm against the macro definition. */
+    CHECKED_MALLOC_ZERO(m_isSubSampled, int, 1);
+    return true;
+
+fail:
+    return false;
+}
+
 bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
 {
     m_encData = new FrameData;
@@ -207,6 +251,26 @@
         m_fencPic = NULL;
     }
 
+    if (m_param->bEnableTemporalFilter)
+    {
+
+        if (m_fencPicSubsampled2)
+        {
+            m_fencPicSubsampled2->destroy();
+            delete m_fencPicSubsampled2;
+            m_fencPicSubsampled2 = NULL;
+        }
+
+        if (m_fencPicSubsampled4)
+        {
+            m_fencPicSubsampled4->destroy();
+            delete m_fencPicSubsampled4;
+            m_fencPicSubsampled4 = NULL;
+        }
+        delete m_mcstf;
+        X265_FREE(m_isSubSampled);
+    }
+
     if (m_reconPic)
     {
         m_reconPic->destroy();
@@ -267,7 +331,8 @@
         X265_FREE(m_addOnPrevChange);
         m_addOnPrevChange = NULL;
     }
-    m_lowres.destroy();
+
+    m_lowres.destroy(m_param);
     X265_FREE(m_rcData);
 
     if (m_param->bDynamicRefine)

 
@@ -64,12 +64,40 @@
     m_edgeBitPlane = NULL;
     m_edgeBitPic = NULL;
     m_isInsideWindow = 0;
+
+    // mcstf
+    m_isSubSampled = NULL;
+    m_mcstf = NULL;
+    m_refPicCnt0 = 0;
+    m_refPicCnt1 = 0;
+    m_nextMCSTF = NULL;
+    m_prevMCSTF = NULL;
+
+    m_tempLayer = 0;
+    m_sameLayerRefPic = false;
 }
 
 bool Frame::create(x265_param *param, float* quantOffsets)
 {
     m_fencPic = new PicYuv;
     m_param = param;
+
+    if (m_param->bEnableTemporalFilter)
+    {
+        m_mcstf = new TemporalFilter;
+        m_mcstf->init(param);
+
+        m_fencPicSubsampled2 = new PicYuv;
+        m_fencPicSubsampled4 = new PicYuv;
+
+        if (!m_fencPicSubsampled2->createScaledPicYUV(param, 2))
+            return false;
+        if (!m_fencPicSubsampled4->createScaledPicYUV(param, 4))
+            return false;
+
+        CHECKED_MALLOC_ZERO(m_isSubSampled, int, 1);
+    }
+
     CHECKED_MALLOC_ZERO(m_rcData, RcStats, 1);
 
     if (param->bCTUInfo)
@@ -151,6 +179,22 @@
     return false;
 }
 
+/* Allocate the 1/2- and 1/4-scale copies of the source picture together
+ * with the m_isSubSampled flag. Returns true when everything was created,
+ * false as soon as one step fails. */
+bool Frame::createSubSample()
+{
+    m_fencPicSubsampled2 = new PicYuv;
+    m_fencPicSubsampled4 = new PicYuv;
+
+    /* && short-circuits, so the 1/4-scale picture is only attempted once
+     * the 1/2-scale one has been created. */
+    const bool scaledPicsCreated = m_fencPicSubsampled2->createScaledPicYUV(m_param, 2)
+                                && m_fencPicSubsampled4->createScaledPicYUV(m_param, 4);
+    if (!scaledPicsCreated)
+        return false;
+
+    /* NOTE(review): CHECKED_MALLOC_ZERO presumably jumps to `fail` when the
+     * allocation fails -- confirm against the macro definition. */
+    CHECKED_MALLOC_ZERO(m_isSubSampled, int, 1);
+    return true;
+
+fail:
+    return false;
+}
+
 bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
 {
     m_encData = new FrameData;
@@ -207,6 +251,26 @@
         m_fencPic = NULL;
     }
 
+    if (m_param->bEnableTemporalFilter)
+    {
+
+        if (m_fencPicSubsampled2)
+        {
+            m_fencPicSubsampled2->destroy();
+            delete m_fencPicSubsampled2;
+            m_fencPicSubsampled2 = NULL;
+        }
+
+        if (m_fencPicSubsampled4)
+        {
+            m_fencPicSubsampled4->destroy();
+            delete m_fencPicSubsampled4;
+            m_fencPicSubsampled4 = NULL;
+        }
+        delete m_mcstf;
+        X265_FREE(m_isSubSampled);
+    }
+
     if (m_reconPic)
     {
         m_reconPic->destroy();
@@ -267,7 +331,8 @@
         X265_FREE(m_addOnPrevChange);
         m_addOnPrevChange = NULL;
     }
-    m_lowres.destroy();
+
+    m_lowres.destroy(m_param);
     X265_FREE(m_rcData);
 
     if (m_param->bDynamicRefine)
​

x265_3.5.tar.gz/source/common/frame.h -> x265_3.6.tar.gz/source/common/frame.h Changed

@@ -28,6 +28,7 @@
 #include "common.h"
 #include "lowres.h"
 #include "threading.h"
+#include "temporalfilter.h"
 
 namespace X265_NS {
 // private namespace
@@ -70,6 +71,7 @@
     double   count4;
     double   offset4;
     double   bufferFillFinal;
+    int64_t  currentSatd;
 };
 
 class Frame
@@ -83,8 +85,12 @@
 
     /* Data associated with x265_picture */
     PicYuv*                m_fencPic;
+    PicYuv*                m_fencPicSubsampled2;
+    PicYuv*                m_fencPicSubsampled4;
+
     int                    m_poc;
     int                    m_encodeOrder;
+    int                    m_gopOffset;
     int64_t                m_pts;                // user provided presentation time stamp
     int64_t                m_reorderedPts;
     int64_t                m_dts;
@@ -132,6 +138,13 @@
     bool                   m_classifyFrame;
     int                    m_fieldNum;
 
+    /*MCSTF*/
+    TemporalFilter*        m_mcstf;
+    int                    m_refPicCnt2;
+    Frame*                 m_nextMCSTF;           // PicList doubly linked list pointers
+    Frame*                 m_prevMCSTF;
+    int*                   m_isSubSampled;
+
     /* aq-mode 4 : Gaussian, edge and theta frames for edge information */
     pixel*                 m_edgePic;
     pixel*                 m_gaussianPic;
@@ -143,9 +156,15 @@
 
     int                    m_isInsideWindow;
 
+    /*Frame's temporal layer info*/
+    uint8_t                m_tempLayer;
+    int8_t                 m_gopId;
+    bool                   m_sameLayerRefPic;
+
     Frame();
 
     bool create(x265_param *param, float* quantOffsets);
+    bool createSubSample();
     bool allocEncodeData(x265_param *param, const SPS& sps);
     void reinit(const SPS& sps);
     void destroy();

 
@@ -28,6 +28,7 @@
 #include "common.h"
 #include "lowres.h"
 #include "threading.h"
+#include "temporalfilter.h"
 
 namespace X265_NS {
 // private namespace
@@ -70,6 +71,7 @@
     double   count4;
     double   offset4;
     double   bufferFillFinal;
+    int64_t  currentSatd;
 };
 
 class Frame
@@ -83,8 +85,12 @@
 
     /* Data associated with x265_picture */
     PicYuv*                m_fencPic;
+    PicYuv*                m_fencPicSubsampled2;
+    PicYuv*                m_fencPicSubsampled4;
+
     int                    m_poc;
     int                    m_encodeOrder;
+    int                    m_gopOffset;
     int64_t                m_pts;                // user provided presentation time stamp
     int64_t                m_reorderedPts;
     int64_t                m_dts;
@@ -132,6 +138,13 @@
     bool                   m_classifyFrame;
     int                    m_fieldNum;
 
+    /*MCSTF*/
+    TemporalFilter*        m_mcstf;
+    int                    m_refPicCnt2;
+    Frame*                 m_nextMCSTF;           // PicList doubly linked list pointers
+    Frame*                 m_prevMCSTF;
+    int*                   m_isSubSampled;
+
     /* aq-mode 4 : Gaussian, edge and theta frames for edge information */
     pixel*                 m_edgePic;
     pixel*                 m_gaussianPic;
@@ -143,9 +156,15 @@
 
     int                    m_isInsideWindow;
 
+    /*Frame's temporal layer info*/
+    uint8_t                m_tempLayer;
+    int8_t                 m_gopId;
+    bool                   m_sameLayerRefPic;
+
     Frame();
 
     bool create(x265_param *param, float* quantOffsets);
+    bool createSubSample();
     bool allocEncodeData(x265_param *param, const SPS& sps);
     void reinit(const SPS& sps);
     void destroy();
​

x265_3.5.tar.gz/source/common/framedata.cpp -> x265_3.6.tar.gz/source/common/framedata.cpp Changed

 
@@ -62,7 +62,7 @@
     }
     else
         return false;
-    CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame);
+    CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame + 1);
     CHECKED_MALLOC(m_rowStat, RCStatRow, sps.numCuInHeight);
     reinit(sps);
     
​

x265_3.5.tar.gz/source/common/lowres.cpp -> x265_3.6.tar.gz/source/common/lowres.cpp Changed

@@ -28,6 +28,28 @@
 
 using namespace X265_NS;
 
+/*
+ * Down Sample input picture
+ *
+ * Writes a width x height picture to dst0, reading a (2 * width) x (2 * height)
+ * region of src0. Each output pixel is built from a 2x2 block of source pixels:
+ * the two vertical pairs are averaged with rounding, then those two results are
+ * averaged with rounding again. Strides are in pixels, not bytes.
+ */
+static
+void frame_lowres_core(const pixel* src0, pixel* dst0,
+    intptr_t src_stride, intptr_t dst_stride, int width, int height)
+{
+    for (int y = 0; y < height; y++)
+    {
+        // src1 is the source row directly below src0
+        const pixel* src1 = src0 + src_stride;
+        for (int x = 0; x < width; x++)
+        {
+            // slower than naive bilinear, but matches asm
+#define FILTER(a, b, c, d) ((((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1)
+            dst0[x] = FILTER(src0[2 * x], src1[2 * x], src0[2 * x + 1], src1[2 * x + 1]);
+#undef FILTER
+        }
+        // one output row consumes two source rows
+        src0 += src_stride * 2;
+        dst0 += dst_stride;
+    }
+}
+
 bool PicQPAdaptationLayer::create(uint32_t width, uint32_t height, uint32_t partWidth, uint32_t partHeight, uint32_t numAQPartInWidthExt, uint32_t numAQPartInHeightExt)
 {
     aqPartWidth = partWidth;
@@ -73,7 +95,7 @@
 
     size_t planesize = lumaStride * (lines + 2 * origPic->m_lumaMarginY);
     size_t padoffset = lumaStride * origPic->m_lumaMarginY + origPic->m_lumaMarginX;
-    if (!!param->rc.aqMode || !!param->rc.hevcAq || !!param->bAQMotion)
+    if (!!param->rc.aqMode || !!param->rc.hevcAq || !!param->bAQMotion || !!param->bEnableWeightedPred || !!param->bEnableWeightedBiPred)
     {
         CHECKED_MALLOC_ZERO(qpAqOffset, double, cuCountFullRes);
         CHECKED_MALLOC_ZERO(invQscaleFactor, int, cuCountFullRes);
@@ -190,13 +212,45 @@
         }
     }
 
+    if (param->bHistBasedSceneCut)
+    {
+        quarterSampleLowResWidth = widthFullRes / 4;
+        quarterSampleLowResHeight = heightFullRes / 4;
+        quarterSampleLowResOriginX = 16;
+        quarterSampleLowResOriginY = 16;
+        quarterSampleLowResStrideY = quarterSampleLowResWidth + 2 * quarterSampleLowResOriginY;
+
+        size_t quarterSampleLowResPlanesize = quarterSampleLowResStrideY * (quarterSampleLowResHeight + 2 * quarterSampleLowResOriginX);
+        /* allocate quarter sampled lowres buffers */
+        CHECKED_MALLOC_ZERO(quarterSampleLowResBuffer, pixel, quarterSampleLowResPlanesize);
+
+        // Allocate memory for Histograms
+        picHistogram = X265_MALLOC(uint32_t***, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t***));
+        picHistogram0 = X265_MALLOC(uint32_t**, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
+        for (uint32_t wd = 1; wd < NUMBER_OF_SEGMENTS_IN_WIDTH; wd++) {
+            picHistogramwd = picHistogram0 + wd * NUMBER_OF_SEGMENTS_IN_HEIGHT;
+        }
+
+        for (uint32_t regionInPictureWidthIndex = 0; regionInPictureWidthIndex < NUMBER_OF_SEGMENTS_IN_WIDTH; regionInPictureWidthIndex++)
+        {
+            for (uint32_t regionInPictureHeightIndex = 0; regionInPictureHeightIndex < NUMBER_OF_SEGMENTS_IN_HEIGHT; regionInPictureHeightIndex++)
+            {
+                picHistogramregionInPictureWidthIndexregionInPictureHeightIndex = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH *sizeof(uint32_t*));
+                picHistogramregionInPictureWidthIndexregionInPictureHeightIndex0 = X265_MALLOC(uint32_t, 3 * HISTOGRAM_NUMBER_OF_BINS * sizeof(uint32_t));
+                for (uint32_t wd = 1; wd < 3; wd++) {
+                    picHistogramregionInPictureWidthIndexregionInPictureHeightIndexwd = picHistogramregionInPictureWidthIndexregionInPictureHeightIndex0 + wd * HISTOGRAM_NUMBER_OF_BINS;
+                }
+            }
+        }
+    }
+
     return true;
 
 fail:
     return false;
 }
 
-void Lowres::destroy()
+void Lowres::destroy(x265_param* param)
 {
     X265_FREE(buffer0);
     if(bEnableHME)
@@ -234,7 +288,8 @@
     X265_FREE(invQscaleFactor8x8);
     X265_FREE(edgeInclined);
     X265_FREE(qpAqMotionOffset);
-    X265_FREE(blockVariance);
+    if (param->bDynamicRefine || param->bEnableFades)
+        X265_FREE(blockVariance);
     if (maxAQDepth > 0)
     {
         for (uint32_t d = 0; d < 4; d++)
@@ -254,6 +309,29 @@
 
         delete pAQLayer;
     }
+
+    // Histograms
+    if (param->bHistBasedSceneCut)
+    {
+        for (uint32_t segmentInFrameWidthIdx = 0; segmentInFrameWidthIdx < NUMBER_OF_SEGMENTS_IN_WIDTH; segmentInFrameWidthIdx++)
+        {
+            if (picHistogramsegmentInFrameWidthIdx)
+            {
+                for (uint32_t segmentInFrameHeightIdx = 0; segmentInFrameHeightIdx < NUMBER_OF_SEGMENTS_IN_HEIGHT; segmentInFrameHeightIdx++)
+                {
+                    if (picHistogramsegmentInFrameWidthIdxsegmentInFrameHeightIdx)
+                        X265_FREE(picHistogramsegmentInFrameWidthIdxsegmentInFrameHeightIdx0);
+                    X265_FREE(picHistogramsegmentInFrameWidthIdxsegmentInFrameHeightIdx);
+                }
+            }
+        }
+        if (picHistogram)
+            X265_FREE(picHistogram0);
+        X265_FREE(picHistogram);
+
+        X265_FREE(quarterSampleLowResBuffer);
+
+    }
 }
 // (re) initialize lowres state
 void Lowres::init(PicYuv *origPic, int poc)
@@ -266,10 +344,6 @@
     indB = 0;
     memset(costEst, -1, sizeof(costEst));
     memset(weightedCostDelta, 0, sizeof(weightedCostDelta));
-    interPCostPercDiff = 0.0;
-    intraCostPercDiff = 0.0;
-    m_bIsMaxThres = false;
-    m_bIsHardScenecut = false;
 
     if (qpAqOffset && invQscaleFactor)
         memset(costEstAq, -1, sizeof(costEstAq));
@@ -314,4 +388,16 @@
     }
 
     fpelPlane0 = lowresPlane0;
+
+    if (origPic->m_param->bHistBasedSceneCut)
+    {
+        // Quarter Sampled Input Picture Formation
+        // TO DO: Replace with ASM function
+        frame_lowres_core(
+            lowresPlane0,
+            quarterSampleLowResBuffer + quarterSampleLowResOriginX + quarterSampleLowResOriginY * quarterSampleLowResStrideY,
+            lumaStride,
+            quarterSampleLowResStrideY,
+            widthFullRes / 4, heightFullRes / 4);
+    }
 }

 
@@ -28,6 +28,28 @@
 
 using namespace X265_NS;
 
+/*
+ * Down Sample input picture
+ *
+ * Writes a width x height picture to dst0, reading a (2 * width) x (2 * height)
+ * region of src0. Each output pixel is built from a 2x2 block of source pixels:
+ * the two vertical pairs are averaged with rounding, then those two results are
+ * averaged with rounding again. Strides are in pixels, not bytes.
+ */
+static
+void frame_lowres_core(const pixel* src0, pixel* dst0,
+    intptr_t src_stride, intptr_t dst_stride, int width, int height)
+{
+    for (int y = 0; y < height; y++)
+    {
+        // src1 is the source row directly below src0
+        const pixel* src1 = src0 + src_stride;
+        for (int x = 0; x < width; x++)
+        {
+            // slower than naive bilinear, but matches asm
+#define FILTER(a, b, c, d) ((((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1)
+            dst0[x] = FILTER(src0[2 * x], src1[2 * x], src0[2 * x + 1], src1[2 * x + 1]);
+#undef FILTER
+        }
+        // one output row consumes two source rows
+        src0 += src_stride * 2;
+        dst0 += dst_stride;
+    }
+}
+
 bool PicQPAdaptationLayer::create(uint32_t width, uint32_t height, uint32_t partWidth, uint32_t partHeight, uint32_t numAQPartInWidthExt, uint32_t numAQPartInHeightExt)
 {
     aqPartWidth = partWidth;
@@ -73,7 +95,7 @@
 
     size_t planesize = lumaStride * (lines + 2 * origPic->m_lumaMarginY);
     size_t padoffset = lumaStride * origPic->m_lumaMarginY + origPic->m_lumaMarginX;
-    if (!!param->rc.aqMode || !!param->rc.hevcAq || !!param->bAQMotion)
+    if (!!param->rc.aqMode || !!param->rc.hevcAq || !!param->bAQMotion || !!param->bEnableWeightedPred || !!param->bEnableWeightedBiPred)
     {
         CHECKED_MALLOC_ZERO(qpAqOffset, double, cuCountFullRes);
         CHECKED_MALLOC_ZERO(invQscaleFactor, int, cuCountFullRes);
@@ -190,13 +212,45 @@
         }
     }
 
+    if (param->bHistBasedSceneCut)
+    {
+        quarterSampleLowResWidth = widthFullRes / 4;
+        quarterSampleLowResHeight = heightFullRes / 4;
+        quarterSampleLowResOriginX = 16;
+        quarterSampleLowResOriginY = 16;
+        quarterSampleLowResStrideY = quarterSampleLowResWidth + 2 * quarterSampleLowResOriginY;
+
+        size_t quarterSampleLowResPlanesize = quarterSampleLowResStrideY * (quarterSampleLowResHeight + 2 * quarterSampleLowResOriginX);
+        /* allocate quarter sampled lowres buffers */
+        CHECKED_MALLOC_ZERO(quarterSampleLowResBuffer, pixel, quarterSampleLowResPlanesize);
+
+        // Allocate memory for Histograms
+        picHistogram = X265_MALLOC(uint32_t***, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t***));
+        picHistogram0 = X265_MALLOC(uint32_t**, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
+        for (uint32_t wd = 1; wd < NUMBER_OF_SEGMENTS_IN_WIDTH; wd++) {
+            picHistogramwd = picHistogram0 + wd * NUMBER_OF_SEGMENTS_IN_HEIGHT;
+        }
+
+        for (uint32_t regionInPictureWidthIndex = 0; regionInPictureWidthIndex < NUMBER_OF_SEGMENTS_IN_WIDTH; regionInPictureWidthIndex++)
+        {
+            for (uint32_t regionInPictureHeightIndex = 0; regionInPictureHeightIndex < NUMBER_OF_SEGMENTS_IN_HEIGHT; regionInPictureHeightIndex++)
+            {
+                picHistogramregionInPictureWidthIndexregionInPictureHeightIndex = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH *sizeof(uint32_t*));
+                picHistogramregionInPictureWidthIndexregionInPictureHeightIndex0 = X265_MALLOC(uint32_t, 3 * HISTOGRAM_NUMBER_OF_BINS * sizeof(uint32_t));
+                for (uint32_t wd = 1; wd < 3; wd++) {
+                    picHistogramregionInPictureWidthIndexregionInPictureHeightIndexwd = picHistogramregionInPictureWidthIndexregionInPictureHeightIndex0 + wd * HISTOGRAM_NUMBER_OF_BINS;
+                }
+            }
+        }
+    }
+
     return true;
 
 fail:
     return false;
 }
 
-void Lowres::destroy()
+void Lowres::destroy(x265_param* param)
 {
     X265_FREE(buffer0);
     if(bEnableHME)
@@ -234,7 +288,8 @@
     X265_FREE(invQscaleFactor8x8);
     X265_FREE(edgeInclined);
     X265_FREE(qpAqMotionOffset);
-    X265_FREE(blockVariance);
+    if (param->bDynamicRefine || param->bEnableFades)
+        X265_FREE(blockVariance);
     if (maxAQDepth > 0)
     {
         for (uint32_t d = 0; d < 4; d++)
@@ -254,6 +309,29 @@
 
         delete pAQLayer;
     }
+
+    // Histograms
+    if (param->bHistBasedSceneCut)
+    {
+        for (uint32_t segmentInFrameWidthIdx = 0; segmentInFrameWidthIdx < NUMBER_OF_SEGMENTS_IN_WIDTH; segmentInFrameWidthIdx++)
+        {
+            if (picHistogramsegmentInFrameWidthIdx)
+            {
+                for (uint32_t segmentInFrameHeightIdx = 0; segmentInFrameHeightIdx < NUMBER_OF_SEGMENTS_IN_HEIGHT; segmentInFrameHeightIdx++)
+                {
+                    if (picHistogramsegmentInFrameWidthIdxsegmentInFrameHeightIdx)
+                        X265_FREE(picHistogramsegmentInFrameWidthIdxsegmentInFrameHeightIdx0);
+                    X265_FREE(picHistogramsegmentInFrameWidthIdxsegmentInFrameHeightIdx);
+                }
+            }
+        }
+        if (picHistogram)
+            X265_FREE(picHistogram0);
+        X265_FREE(picHistogram);
+
+        X265_FREE(quarterSampleLowResBuffer);
+
+    }
 }
 // (re) initialize lowres state
 void Lowres::init(PicYuv *origPic, int poc)
@@ -266,10 +344,6 @@
     indB = 0;
     memset(costEst, -1, sizeof(costEst));
     memset(weightedCostDelta, 0, sizeof(weightedCostDelta));
-    interPCostPercDiff = 0.0;
-    intraCostPercDiff = 0.0;
-    m_bIsMaxThres = false;
-    m_bIsHardScenecut = false;
 
     if (qpAqOffset && invQscaleFactor)
         memset(costEstAq, -1, sizeof(costEstAq));
@@ -314,4 +388,16 @@
     }
 
     fpelPlane0 = lowresPlane0;
+
+    if (origPic->m_param->bHistBasedSceneCut)
+    {
+        // Quarter Sampled Input Picture Formation
+        // TO DO: Replace with ASM function
+        frame_lowres_core(
+            lowresPlane0,
+            quarterSampleLowResBuffer + quarterSampleLowResOriginX + quarterSampleLowResOriginY * quarterSampleLowResStrideY,
+            lumaStride,
+            quarterSampleLowResStrideY,
+            widthFullRes / 4, heightFullRes / 4);
+    }
 }
​

x265_3.5.tar.gz/source/common/lowres.h -> x265_3.6.tar.gz/source/common/lowres.h Changed

@@ -32,6 +32,10 @@
 namespace X265_NS {
 // private namespace
 
+#define HISTOGRAM_NUMBER_OF_BINS         256
+#define NUMBER_OF_SEGMENTS_IN_WIDTH      4
+#define NUMBER_OF_SEGMENTS_IN_HEIGHT     4
+
 struct ReferencePlanes
 {
     ReferencePlanes() { memset(this, 0, sizeof(ReferencePlanes)); }
@@ -171,6 +175,7 @@
 
     int    frameNum;         // Presentation frame number
     int    sliceType;        // Slice type decided by lookahead
+    int    sliceTypeReq;     // Slice type required as per the QP file
     int    width;            // width of lowres frame in pixels
     int    lines;            // height of lowres frame in pixel lines
     int    leadingBframes;   // number of leading B frames for P or I
@@ -214,13 +219,13 @@
     double*   qpAqOffset;      // AQ QP offset values for each 16x16 CU
     double*   qpCuTreeOffset;  // cuTree QP offset values for each 16x16 CU
     double*   qpAqMotionOffset;
-    int*      invQscaleFactor; // qScale values for qp Aq Offsets
+    int*      invQscaleFactor;    // qScale values for qp Aq Offsets
     int*      invQscaleFactor8x8; // temporary buffer for qg-size 8
     uint32_t* blockVariance;
     uint64_t  wp_ssd3;       // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
     uint64_t  wp_sum3;
     double    frameVariance;
-    int* edgeInclined;
+    int*      edgeInclined;
 
 
     /* cutree intermediate data */
@@ -230,18 +235,30 @@
     uint32_t heightFullRes;
     uint32_t m_maxCUSize;
     uint32_t m_qgSize;
-    
+
     uint16_t* propagateCost;
     double    weightedCostDeltaX265_BFRAME_MAX + 2;
     ReferencePlanes weightedRefX265_BFRAME_MAX + 2;
+
     /* For hist-based scenecut */
-    bool   m_bIsMaxThres;
-    double interPCostPercDiff;
-    double intraCostPercDiff;
-    bool   m_bIsHardScenecut;
+    int          quarterSampleLowResWidth;     // width of 1/4 lowres frame in pixels
+    int          quarterSampleLowResHeight;    // height of 1/4 lowres frame in pixels
+    int          quarterSampleLowResStrideY;
+    int          quarterSampleLowResOriginX;
+    int          quarterSampleLowResOriginY;
+    pixel       *quarterSampleLowResBuffer;
+    bool         bHistScenecutAnalyzed;
+
+    uint16_t     picAvgVariance;
+    uint16_t     picAvgVarianceCb;
+    uint16_t     picAvgVarianceCr;
+
+    uint32_t ****picHistogram;
+    uint64_t     averageIntensityPerSegmentNUMBER_OF_SEGMENTS_IN_WIDTHNUMBER_OF_SEGMENTS_IN_HEIGHT3;
+    uint8_t      averageIntensity3;
 
     bool create(x265_param* param, PicYuv *origPic, uint32_t qgSize);
-    void destroy();
+    void destroy(x265_param* param);
     void init(PicYuv *origPic, int poc);
 };
 }

 
@@ -32,6 +32,10 @@
 namespace X265_NS {
 // private namespace
 
+#define HISTOGRAM_NUMBER_OF_BINS         256
+#define NUMBER_OF_SEGMENTS_IN_WIDTH      4
+#define NUMBER_OF_SEGMENTS_IN_HEIGHT     4
+
 struct ReferencePlanes
 {
     ReferencePlanes() { memset(this, 0, sizeof(ReferencePlanes)); }
@@ -171,6 +175,7 @@
 
     int    frameNum;         // Presentation frame number
     int    sliceType;        // Slice type decided by lookahead
+    int    sliceTypeReq;     // Slice type required as per the QP file
     int    width;            // width of lowres frame in pixels
     int    lines;            // height of lowres frame in pixel lines
     int    leadingBframes;   // number of leading B frames for P or I
@@ -214,13 +219,13 @@
     double*   qpAqOffset;      // AQ QP offset values for each 16x16 CU
     double*   qpCuTreeOffset;  // cuTree QP offset values for each 16x16 CU
     double*   qpAqMotionOffset;
-    int*      invQscaleFactor; // qScale values for qp Aq Offsets
+    int*      invQscaleFactor;    // qScale values for qp Aq Offsets
     int*      invQscaleFactor8x8; // temporary buffer for qg-size 8
     uint32_t* blockVariance;
     uint64_t  wp_ssd3;       // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
     uint64_t  wp_sum3;
     double    frameVariance;
-    int* edgeInclined;
+    int*      edgeInclined;
 
 
     /* cutree intermediate data */
@@ -230,18 +235,30 @@
     uint32_t heightFullRes;
     uint32_t m_maxCUSize;
     uint32_t m_qgSize;
-    
+
     uint16_t* propagateCost;
     double    weightedCostDeltaX265_BFRAME_MAX + 2;
     ReferencePlanes weightedRefX265_BFRAME_MAX + 2;
+
     /* For hist-based scenecut */
-    bool   m_bIsMaxThres;
-    double interPCostPercDiff;
-    double intraCostPercDiff;
-    bool   m_bIsHardScenecut;
+    int          quarterSampleLowResWidth;     // width of 1/4 lowres frame in pixels
+    int          quarterSampleLowResHeight;    // height of 1/4 lowres frame in pixels
+    int          quarterSampleLowResStrideY;
+    int          quarterSampleLowResOriginX;
+    int          quarterSampleLowResOriginY;
+    pixel       *quarterSampleLowResBuffer;
+    bool         bHistScenecutAnalyzed;
+
+    uint16_t     picAvgVariance;
+    uint16_t     picAvgVarianceCb;
+    uint16_t     picAvgVarianceCr;
+
+    uint32_t ****picHistogram;
+    uint64_t     averageIntensityPerSegmentNUMBER_OF_SEGMENTS_IN_WIDTHNUMBER_OF_SEGMENTS_IN_HEIGHT3;
+    uint8_t      averageIntensity3;
 
     bool create(x265_param* param, PicYuv *origPic, uint32_t qgSize);
-    void destroy();
+    void destroy(x265_param* param);
     void init(PicYuv *origPic, int poc);
 };
 }
​

x265_3.5.tar.gz/source/common/mv.h -> x265_3.6.tar.gz/source/common/mv.h Changed

 
@@ -105,6 +105,8 @@
     {
         return x >= _min.x && x <= _max.x && y >= _min.y && y <= _max.y;
     }
+
+    void set(int32_t _x, int32_t _y) { x = _x; y = _y; } // assign both x and y in one call
 };
 }
 
​

x265_3.5.tar.gz/source/common/param.cpp -> x265_3.6.tar.gz/source/common/param.cpp Changed

@@ -145,6 +145,8 @@
     param->bAnnexB = 1;
     param->bRepeatHeaders = 0;
     param->bEnableAccessUnitDelimiters = 0;
+    param->bEnableEndOfBitstream = 0;
+    param->bEnableEndOfSequence = 0;
     param->bEmitHRDSEI = 0;
     param->bEmitInfoSEI = 1;
     param->bEmitHDRSEI = 0; /*Deprecated*/
@@ -163,12 +165,12 @@
     param->keyframeMax = 250;
     param->gopLookahead = 0;
     param->bOpenGOP = 1;
+	param->craNal = 0;
     param->bframes = 4;
     param->lookaheadDepth = 20;
     param->bFrameAdaptive = X265_B_ADAPT_TRELLIS;
     param->bBPyramid = 1;
     param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
-    param->edgeTransitionThreshold = 0.03;
     param->bHistBasedSceneCut = 0;
     param->lookaheadSlices = 8;
     param->lookaheadThreads = 0;
@@ -179,12 +181,20 @@
     param->bEnableHRDConcatFlag = 0;
     param->bEnableFades = 0;
     param->bEnableSceneCutAwareQp = 0;
-    param->fwdScenecutWindow = 500;
-    param->fwdRefQpDelta = 5;
-    param->fwdNonRefQpDelta = param->fwdRefQpDelta + (SLICE_TYPE_DELTA * param->fwdRefQpDelta);
-    param->bwdScenecutWindow = 100;
-    param->bwdRefQpDelta = -1;
-    param->bwdNonRefQpDelta = -1;
+    param->fwdMaxScenecutWindow = 1200;
+    param->bwdMaxScenecutWindow = 600;
+    for (int i = 0; i < 6; i++)
+    {
+        int deltas6 = { 5, 4, 3, 2, 1, 0 };
+
+        param->fwdScenecutWindowi = 200;
+        param->fwdRefQpDeltai = deltasi;
+        param->fwdNonRefQpDeltai = param->fwdRefQpDeltai + (SLICE_TYPE_DELTA * param->fwdRefQpDeltai);
+
+        param->bwdScenecutWindowi = 100;
+        param->bwdRefQpDeltai = -1;
+        param->bwdNonRefQpDeltai = -1;
+    }
 
     /* Intra Coding Tools */
     param->bEnableConstrainedIntra = 0;
@@ -278,7 +288,10 @@
     param->rc.rfConstantMin = 0;
     param->rc.bStatRead = 0;
     param->rc.bStatWrite = 0;
+    param->rc.dataShareMode = X265_SHARE_MODE_FILE;
     param->rc.statFileName = NULL;
+    param->rc.sharedMemName = NULL;
+    param->rc.bEncFocusedFramesOnly = 0;
     param->rc.complexityBlur = 20;
     param->rc.qblur = 0.5;
     param->rc.zoneCount = 0;
@@ -321,6 +334,7 @@
     param->maxLuma = PIXEL_MAX;
     param->log2MaxPocLsb = 8;
     param->maxSlices = 1;
+    param->videoSignalTypePreset = NULL;
 
     /*Conformance window*/
     param->confWinRightOffset = 0;
@@ -373,10 +387,17 @@
     param->bEnableSvtHevc = 0;
     param->svtHevcParam = NULL;
 
+    /* MCSTF */
+    param->bEnableTemporalFilter = 0;
+    param->temporalFilterStrength = 0.95;
+
 #ifdef SVT_HEVC
     param->svtHevcParam = svtParam;
     svt_param_default(param);
 #endif
+    /* Film grain characteristics model filename */
+    param->filmGrain = NULL;
+    param->bEnableSBRC = 0;
 }
 
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
@@ -666,6 +687,46 @@
 #define atof(str) x265_atof(str, bError)
 #define atobool(str) (x265_atobool(str, bError))
 
+/* Parses one scenecut-aware-qp related option into p.
+ *
+ * Handles only "scenecut-aware-qp" and "masking-strength". The name may carry a
+ * leading "--" and may use '_' in place of '-'. A NULL value is treated as
+ * "true", and a leading '=' in value is skipped.
+ *
+ * Returns X265_PARAM_BAD_NAME when name is NULL or not one of the two options,
+ * X265_PARAM_BAD_VALUE when parsing the value flagged an error, and 0 otherwise. */
+int x265_scenecut_aware_qp_param_parse(x265_param* p, const char* name, const char* value)
+{
+    bool bError = false;
+    char nameBuf[64];
+    if (!name)
+        return X265_PARAM_BAD_NAME;
+    // skip -- prefix if provided
+    if (name[0] == '-' && name[1] == '-')
+        name += 2;
+    // s/_/-/g (only when the name fits in nameBuf; longer names are compared unchanged)
+    if (strlen(name) + 1 < sizeof(nameBuf) && strchr(name, '_'))
+    {
+        char *c;
+        strcpy(nameBuf, name);
+        while ((c = strchr(nameBuf, '_')) != 0)
+            *c = '-';
+        name = nameBuf;
+    }
+    if (!value)
+        value = "true";
+    else if (value[0] == '=')
+        value++;
+    // OPT() expands to an else-if on the option name, so "if (0);" just opens the chain
+#define OPT(STR) else if (!strcmp(name, STR))
+    if (0);
+    OPT("scenecut-aware-qp") p->bEnableSceneCutAwareQp = x265_atoi(value, bError);
+    OPT("masking-strength") bError = parseMaskingStrength(p, value);
+    else
+        return X265_PARAM_BAD_NAME;
+#undef OPT
+    return bError ? X265_PARAM_BAD_VALUE : 0;
+}
+
+
+/* internal versions of string-to-int with additional error checking */
+#undef atoi
+#undef atof
+#define atoi(str) x265_atoi(str, bError)
+#define atof(str) x265_atof(str, bError)
+#define atobool(str) (x265_atobool(str, bError))
+
 int x265_zone_param_parse(x265_param* p, const char* name, const char* value)
 {
     bool bError = false;
@@ -949,10 +1010,9 @@
        {
            bError = false;
            p->scenecutThreshold = atoi(value);
-           p->bHistBasedSceneCut = 0;
        }
     }
-    OPT("temporal-layers") p->bEnableTemporalSubLayers = atobool(value);
+    OPT("temporal-layers") p->bEnableTemporalSubLayers = atoi(value);
     OPT("keyint") p->keyframeMax = atoi(value);
     OPT("min-keyint") p->keyframeMin = atoi(value);
     OPT("rc-lookahead") p->lookaheadDepth = atoi(value);
@@ -1184,6 +1244,7 @@
         int pass = x265_clip3(0, 3, atoi(value));
         p->rc.bStatWrite = pass & 1;
         p->rc.bStatRead = pass & 2;
+        p->rc.dataShareMode = X265_SHARE_MODE_FILE;
     }
     OPT("stats") p->rc.statFileName = strdup(value);
     OPT("scaling-list") p->scalingLists = strdup(value);
@@ -1216,21 +1277,7 @@
         OPT("opt-ref-list-length-pps") p->bOptRefListLengthPPS = atobool(value);
         OPT("multi-pass-opt-rps") p->bMultiPassOptRPS = atobool(value);
         OPT("scenecut-bias") p->scenecutBias = atof(value);
-        OPT("hist-scenecut")
-        {
-            p->bHistBasedSceneCut = atobool(value);
-            if (bError)
-            {
-                bError = false;
-                p->bHistBasedSceneCut = 0;
-            }
-            if (p->bHistBasedSceneCut)
-            {
-                bError = false;
-                p->scenecutThreshold = 0;
-            }
-        }
-        OPT("hist-threshold") p->edgeTransitionThreshold = atof(value);
+        OPT("hist-scenecut") p->bHistBasedSceneCut = atobool(value);
         OPT("rskip-edge-threshold") p->edgeVarThreshold = atoi(value)/100.0f;
         OPT("lookahead-threads") p->lookaheadThreads = atoi(value);
         OPT("opt-cu-delta-qp") p->bOptCUDeltaQP = atobool(value);
@@ -1238,6 +1285,7 @@
         OPT("multi-pass-opt-distortion") p->analysisMultiPassDistortion = atobool(value);
         OPT("aq-motion") p->bAQMotion = atobool(value);
         OPT("dynamic-rd") p->dynamicRd = atof(value);
+		OPT("cra-nal") p->craNal = atobool(value);
         OPT("analysis-reuse-level")
         {
             p->analysisReuseLevel = atoi(value);
@@ -1348,71 +1396,7 @@
         }
         OPT("fades") p->bEnableFades = atobool(value);
         OPT("scenecut-aware-qp") p->bEnableSceneCutAwareQp = atoi(value);
-        OPT("masking-strength")
-        {
-            int window1;
-            double refQpDelta1, nonRefQpDelta1;
-
-            if (p->bEnableSceneCutAwareQp == FORWARD)
-            {
-                if (3 == sscanf(value, "%d,%lf,%lf", &window1, &refQpDelta1, &nonRefQpDelta1))
-                {
-                    if (window1 > 0)
-                        p->fwdScenecutWindow = window1;

 
@@ -145,6 +145,8 @@
     param->bAnnexB = 1;
     param->bRepeatHeaders = 0;
     param->bEnableAccessUnitDelimiters = 0;
+    param->bEnableEndOfBitstream = 0;
+    param->bEnableEndOfSequence = 0;
     param->bEmitHRDSEI = 0;
     param->bEmitInfoSEI = 1;
     param->bEmitHDRSEI = 0; /*Deprecated*/
@@ -163,12 +165,12 @@
     param->keyframeMax = 250;
     param->gopLookahead = 0;
     param->bOpenGOP = 1;
+   param->craNal = 0;
     param->bframes = 4;
     param->lookaheadDepth = 20;
     param->bFrameAdaptive = X265_B_ADAPT_TRELLIS;
     param->bBPyramid = 1;
     param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
-    param->edgeTransitionThreshold = 0.03;
     param->bHistBasedSceneCut = 0;
     param->lookaheadSlices = 8;
     param->lookaheadThreads = 0;
@@ -179,12 +181,20 @@
     param->bEnableHRDConcatFlag = 0;
     param->bEnableFades = 0;
     param->bEnableSceneCutAwareQp = 0;
-    param->fwdScenecutWindow = 500;
-    param->fwdRefQpDelta = 5;
-    param->fwdNonRefQpDelta = param->fwdRefQpDelta + (SLICE_TYPE_DELTA * param->fwdRefQpDelta);
-    param->bwdScenecutWindow = 100;
-    param->bwdRefQpDelta = -1;
-    param->bwdNonRefQpDelta = -1;
+    param->fwdMaxScenecutWindow = 1200;
+    param->bwdMaxScenecutWindow = 600;
+    for (int i = 0; i < 6; i++)
+    {
+        int deltas6 = { 5, 4, 3, 2, 1, 0 };
+
+        param->fwdScenecutWindowi = 200;
+        param->fwdRefQpDeltai = deltasi;
+        param->fwdNonRefQpDeltai = param->fwdRefQpDeltai + (SLICE_TYPE_DELTA * param->fwdRefQpDeltai);
+
+        param->bwdScenecutWindowi = 100;
+        param->bwdRefQpDeltai = -1;
+        param->bwdNonRefQpDeltai = -1;
+    }
 
     /* Intra Coding Tools */
     param->bEnableConstrainedIntra = 0;
@@ -278,7 +288,10 @@
     param->rc.rfConstantMin = 0;
     param->rc.bStatRead = 0;
     param->rc.bStatWrite = 0;
+    param->rc.dataShareMode = X265_SHARE_MODE_FILE;
     param->rc.statFileName = NULL;
+    param->rc.sharedMemName = NULL;
+    param->rc.bEncFocusedFramesOnly = 0;
     param->rc.complexityBlur = 20;
     param->rc.qblur = 0.5;
     param->rc.zoneCount = 0;
@@ -321,6 +334,7 @@
     param->maxLuma = PIXEL_MAX;
     param->log2MaxPocLsb = 8;
     param->maxSlices = 1;
+    param->videoSignalTypePreset = NULL;
 
     /*Conformance window*/
     param->confWinRightOffset = 0;
@@ -373,10 +387,17 @@
     param->bEnableSvtHevc = 0;
     param->svtHevcParam = NULL;
 
+    /* MCSTF */
+    param->bEnableTemporalFilter = 0;
+    param->temporalFilterStrength = 0.95;
+
 #ifdef SVT_HEVC
     param->svtHevcParam = svtParam;
     svt_param_default(param);
 #endif
+    /* Film grain characteristics model filename */
+    param->filmGrain = NULL;
+    param->bEnableSBRC = 0;
 }
 
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
@@ -666,6 +687,46 @@
 #define atof(str) x265_atof(str, bError)
 #define atobool(str) (x265_atobool(str, bError))
 
+/* Parses one scenecut-aware-qp related option into p.
+ *
+ * Handles only "scenecut-aware-qp" and "masking-strength". The name may carry a
+ * leading "--" and may use '_' in place of '-'. A NULL value is treated as
+ * "true", and a leading '=' in value is skipped.
+ *
+ * Returns X265_PARAM_BAD_NAME when name is NULL or not one of the two options,
+ * X265_PARAM_BAD_VALUE when parsing the value flagged an error, and 0 otherwise. */
+int x265_scenecut_aware_qp_param_parse(x265_param* p, const char* name, const char* value)
+{
+    bool bError = false;
+    char nameBuf[64];
+    if (!name)
+        return X265_PARAM_BAD_NAME;
+    // skip -- prefix if provided
+    if (name[0] == '-' && name[1] == '-')
+        name += 2;
+    // s/_/-/g (only when the name fits in nameBuf; longer names are compared unchanged)
+    if (strlen(name) + 1 < sizeof(nameBuf) && strchr(name, '_'))
+    {
+        char *c;
+        strcpy(nameBuf, name);
+        while ((c = strchr(nameBuf, '_')) != 0)
+            *c = '-';
+        name = nameBuf;
+    }
+    if (!value)
+        value = "true";
+    else if (value[0] == '=')
+        value++;
+    // OPT() expands to an else-if on the option name, so "if (0);" just opens the chain
+#define OPT(STR) else if (!strcmp(name, STR))
+    if (0);
+    OPT("scenecut-aware-qp") p->bEnableSceneCutAwareQp = x265_atoi(value, bError);
+    OPT("masking-strength") bError = parseMaskingStrength(p, value);
+    else
+        return X265_PARAM_BAD_NAME;
+#undef OPT
+    return bError ? X265_PARAM_BAD_VALUE : 0;
+}
+
+
+/* internal versions of string-to-int with additional error checking */
+#undef atoi
+#undef atof
+#define atoi(str) x265_atoi(str, bError)
+#define atof(str) x265_atof(str, bError)
+#define atobool(str) (x265_atobool(str, bError))
+
 int x265_zone_param_parse(x265_param* p, const char* name, const char* value)
 {
     bool bError = false;
@@ -949,10 +1010,9 @@
        {
            bError = false;
            p->scenecutThreshold = atoi(value);
-           p->bHistBasedSceneCut = 0;
        }
     }
-    OPT("temporal-layers") p->bEnableTemporalSubLayers = atobool(value);
+    OPT("temporal-layers") p->bEnableTemporalSubLayers = atoi(value);
     OPT("keyint") p->keyframeMax = atoi(value);
     OPT("min-keyint") p->keyframeMin = atoi(value);
     OPT("rc-lookahead") p->lookaheadDepth = atoi(value);
@@ -1184,6 +1244,7 @@
         int pass = x265_clip3(0, 3, atoi(value));
         p->rc.bStatWrite = pass & 1;
         p->rc.bStatRead = pass & 2;
+        p->rc.dataShareMode = X265_SHARE_MODE_FILE;
     }
     OPT("stats") p->rc.statFileName = strdup(value);
     OPT("scaling-list") p->scalingLists = strdup(value);
@@ -1216,21 +1277,7 @@
         OPT("opt-ref-list-length-pps") p->bOptRefListLengthPPS = atobool(value);
         OPT("multi-pass-opt-rps") p->bMultiPassOptRPS = atobool(value);
         OPT("scenecut-bias") p->scenecutBias = atof(value);
-        OPT("hist-scenecut")
-        {
-            p->bHistBasedSceneCut = atobool(value);
-            if (bError)
-            {
-                bError = false;
-                p->bHistBasedSceneCut = 0;
-            }
-            if (p->bHistBasedSceneCut)
-            {
-                bError = false;
-                p->scenecutThreshold = 0;
-            }
-        }
-        OPT("hist-threshold") p->edgeTransitionThreshold = atof(value);
+        OPT("hist-scenecut") p->bHistBasedSceneCut = atobool(value);
         OPT("rskip-edge-threshold") p->edgeVarThreshold = atoi(value)/100.0f;
         OPT("lookahead-threads") p->lookaheadThreads = atoi(value);
         OPT("opt-cu-delta-qp") p->bOptCUDeltaQP = atobool(value);
@@ -1238,6 +1285,7 @@
         OPT("multi-pass-opt-distortion") p->analysisMultiPassDistortion = atobool(value);
         OPT("aq-motion") p->bAQMotion = atobool(value);
         OPT("dynamic-rd") p->dynamicRd = atof(value);
+       OPT("cra-nal") p->craNal = atobool(value);
         OPT("analysis-reuse-level")
         {
             p->analysisReuseLevel = atoi(value);
@@ -1348,71 +1396,7 @@
         }
         OPT("fades") p->bEnableFades = atobool(value);
         OPT("scenecut-aware-qp") p->bEnableSceneCutAwareQp = atoi(value);
-        OPT("masking-strength")
-        {
-            int window1;
-            double refQpDelta1, nonRefQpDelta1;
-
-            if (p->bEnableSceneCutAwareQp == FORWARD)
-            {
-                if (3 == sscanf(value, "%d,%lf,%lf", &window1, &refQpDelta1, &nonRefQpDelta1))
-                {
-                    if (window1 > 0)
-                        p->fwdScenecutWindow = window1;
​

x265_3.5.tar.gz/source/common/param.h -> x265_3.6.tar.gz/source/common/param.h Changed

 
@@ -38,6 +38,7 @@
 void  getParamAspectRatio(x265_param *p, int& width, int& height);
 bool  parseLambdaFile(x265_param *param);
 void x265_copy_params(x265_param* dst, x265_param* src);
+bool parseMaskingStrength(x265_param* p, const char* value);
 
 /* this table is kept internal to avoid confusion, since log level indices start at -1 */
 static const char * const logLevelNames[] = { "none", "error", "warning", "info", "debug", "full", 0 };
@@ -52,6 +53,7 @@
 int x265_param_default_preset(x265_param *, const char *preset, const char *tune);
 int x265_param_apply_profile(x265_param *, const char *profile);
 int x265_param_parse(x265_param *p, const char *name, const char *value);
+int x265_scenecut_aware_qp_param_parse(x265_param* p, const char* name, const char* value);
 int x265_zone_param_parse(x265_param* p, const char* name, const char* value);
 #define PARAM_NS X265_NS
 #endif
​

x265_3.5.tar.gz/source/common/piclist.cpp -> x265_3.6.tar.gz/source/common/piclist.cpp Changed

@@ -45,6 +45,25 @@
     m_count++;
 }
 
+void PicList::pushFrontMCSTF(Frame& curFrame)
+{
+    X265_CHECK(!curFrame.m_nextMCSTF && !curFrame.m_prevMCSTF, "piclist: picture already in OPB list\n"); // ensure frame is not in a list
+    curFrame.m_nextMCSTF = m_start;
+    curFrame.m_prevMCSTF = NULL;
+
+    if (m_count)
+    {
+        m_start->m_prevMCSTF = &curFrame;
+        m_start = &curFrame;
+    }
+    else
+    {
+        m_start = m_end = &curFrame;
+    }
+    m_count++;
+
+}
+
 void PicList::pushBack(Frame& curFrame)
 {
     X265_CHECK(!curFrame.m_next && !curFrame.m_prev, "piclist: picture already in list\n"); // ensure frame is not in a list
@@ -63,6 +82,24 @@
     m_count++;
 }
 
+void PicList::pushBackMCSTF(Frame& curFrame)
+{
+    X265_CHECK(!curFrame.m_nextMCSTF && !curFrame.m_prevMCSTF, "piclist: picture already in OPB list\n"); // ensure frame is not in a list
+    curFrame.m_nextMCSTF = NULL;
+    curFrame.m_prevMCSTF = m_end;
+
+    if (m_count)
+    {
+        m_end->m_nextMCSTF = &curFrame;
+        m_end = &curFrame;
+    }
+    else
+    {
+        m_start = m_end = &curFrame;
+    }
+    m_count++;
+}
+
 Frame *PicList::popFront()
 {
     if (m_start)
@@ -94,6 +131,14 @@
     return curFrame;
 }
 
+Frame* PicList::getPOCMCSTF(int poc)
+{
+    Frame *curFrame = m_start;
+    while (curFrame && curFrame->m_poc != poc)
+        curFrame = curFrame->m_nextMCSTF;
+    return curFrame;
+}
+
 Frame *PicList::popBack()
 {
     if (m_end)
@@ -117,6 +162,29 @@
         return NULL;
 }
 
+Frame *PicList::popBackMCSTF()
+{
+    if (m_end)
+    {
+        Frame* temp = m_end;
+        m_count--;
+
+        if (m_count)
+        {
+            m_end = m_end->m_prevMCSTF;
+            m_end->m_nextMCSTF = NULL;
+        }
+        else
+        {
+            m_start = m_end = NULL;
+        }
+        temp->m_nextMCSTF = temp->m_prevMCSTF = NULL;
+        return temp;
+    }
+    else
+        return NULL;
+}
+
 Frame* PicList::getCurFrame(void)
 {
     Frame *curFrame = m_start;
@@ -158,3 +226,36 @@
 
     curFrame.m_next = curFrame.m_prev = NULL;
 }
+
+void PicList::removeMCSTF(Frame& curFrame)
+{
+#if _DEBUG
+    Frame *tmp = m_start;
+    while (tmp && tmp != &curFrame)
+    {
+        tmp = tmp->m_nextMCSTF;
+    }
+
+    X265_CHECK(tmp == &curFrame, "framelist: pic being removed was not in list\n"); // verify pic is in this list
+#endif
+
+    m_count--;
+    if (m_count)
+    {
+        if (m_start == &curFrame)
+            m_start = curFrame.m_nextMCSTF;
+        if (m_end == &curFrame)
+            m_end = curFrame.m_prevMCSTF;
+
+        if (curFrame.m_nextMCSTF)
+            curFrame.m_nextMCSTF->m_prevMCSTF = curFrame.m_prevMCSTF;
+        if (curFrame.m_prevMCSTF)
+            curFrame.m_prevMCSTF->m_nextMCSTF = curFrame.m_nextMCSTF;
+    }
+    else
+    {
+        m_start = m_end = NULL;
+    }
+
+    curFrame.m_nextMCSTF = curFrame.m_prevMCSTF = NULL;
+}

 
@@ -45,6 +45,25 @@
     m_count++;
 }
 
+void PicList::pushFrontMCSTF(Frame& curFrame)
+{
+    X265_CHECK(!curFrame.m_nextMCSTF && !curFrame.m_prevMCSTF, "piclist: picture already in OPB list\n"); // ensure frame is not in a list
+    curFrame.m_nextMCSTF = m_start;
+    curFrame.m_prevMCSTF = NULL;
+
+    if (m_count)
+    {
+        m_start->m_prevMCSTF = &curFrame;
+        m_start = &curFrame;
+    }
+    else
+    {
+        m_start = m_end = &curFrame;
+    }
+    m_count++;
+
+}
+
 void PicList::pushBack(Frame& curFrame)
 {
     X265_CHECK(!curFrame.m_next && !curFrame.m_prev, "piclist: picture already in list\n"); // ensure frame is not in a list
@@ -63,6 +82,24 @@
     m_count++;
 }
 
+void PicList::pushBackMCSTF(Frame& curFrame)
+{
+    X265_CHECK(!curFrame.m_nextMCSTF && !curFrame.m_prevMCSTF, "piclist: picture already in OPB list\n"); // ensure frame is not in a list
+    curFrame.m_nextMCSTF = NULL;
+    curFrame.m_prevMCSTF = m_end;
+
+    if (m_count)
+    {
+        m_end->m_nextMCSTF = &curFrame;
+        m_end = &curFrame;
+    }
+    else
+    {
+        m_start = m_end = &curFrame;
+    }
+    m_count++;
+}
+
 Frame *PicList::popFront()
 {
     if (m_start)
@@ -94,6 +131,14 @@
     return curFrame;
 }
 
+Frame* PicList::getPOCMCSTF(int poc)
+{
+    Frame *curFrame = m_start;
+    while (curFrame && curFrame->m_poc != poc)
+        curFrame = curFrame->m_nextMCSTF;
+    return curFrame;
+}
+
 Frame *PicList::popBack()
 {
     if (m_end)
@@ -117,6 +162,29 @@
         return NULL;
 }
 
+Frame *PicList::popBackMCSTF()
+{
+    if (m_end)
+    {
+        Frame* temp = m_end;
+        m_count--;
+
+        if (m_count)
+        {
+            m_end = m_end->m_prevMCSTF;
+            m_end->m_nextMCSTF = NULL;
+        }
+        else
+        {
+            m_start = m_end = NULL;
+        }
+        temp->m_nextMCSTF = temp->m_prevMCSTF = NULL;
+        return temp;
+    }
+    else
+        return NULL;
+}
+
 Frame* PicList::getCurFrame(void)
 {
     Frame *curFrame = m_start;
@@ -158,3 +226,36 @@
 
     curFrame.m_next = curFrame.m_prev = NULL;
 }
+
+void PicList::removeMCSTF(Frame& curFrame)
+{
+#if _DEBUG
+    Frame *tmp = m_start;
+    while (tmp && tmp != &curFrame)
+    {
+        tmp = tmp->m_nextMCSTF;
+    }
+
+    X265_CHECK(tmp == &curFrame, "framelist: pic being removed was not in list\n"); // verify pic is in this list
+#endif
+
+    m_count--;
+    if (m_count)
+    {
+        if (m_start == &curFrame)
+            m_start = curFrame.m_nextMCSTF;
+        if (m_end == &curFrame)
+            m_end = curFrame.m_prevMCSTF;
+
+        if (curFrame.m_nextMCSTF)
+            curFrame.m_nextMCSTF->m_prevMCSTF = curFrame.m_prevMCSTF;
+        if (curFrame.m_prevMCSTF)
+            curFrame.m_prevMCSTF->m_nextMCSTF = curFrame.m_nextMCSTF;
+    }
+    else
+    {
+        m_start = m_end = NULL;
+    }
+
+    curFrame.m_nextMCSTF = curFrame.m_prevMCSTF = NULL;
+}
​

x265_3.5.tar.gz/source/common/piclist.h -> x265_3.6.tar.gz/source/common/piclist.h Changed

 
@@ -49,24 +49,31 @@
 
     /** Push picture to end of the list */
     void pushBack(Frame& pic);
+    void pushBackMCSTF(Frame& pic);
 
     /** Push picture to beginning of the list */
     void pushFront(Frame& pic);
+    void pushFrontMCSTF(Frame& pic);
 
     /** Pop picture from end of the list */
     Frame* popBack();
+    Frame* popBackMCSTF();
 
     /** Pop picture from beginning of the list */
     Frame* popFront();
 
     /** Find frame with specified POC */
     Frame* getPOC(int poc);
+    /* Find next MCSTF frame with specified POC */
+    Frame* getPOCMCSTF(int poc);
 
     /** Get the current Frame from the list **/
     Frame* getCurFrame(void);
 
     /** Remove picture from list */
     void remove(Frame& pic);
+    /* Remove MCSTF picture from list */
+    void removeMCSTF(Frame& pic);
 
     Frame* first()        { return m_start;   }
 
​

x265_3.5.tar.gz/source/common/picyuv.cpp -> x265_3.6.tar.gz/source/common/picyuv.cpp Changed

@@ -125,6 +125,58 @@
     return false;
 }
 
+/*Copy pixels from the picture buffer of a frame to picture buffer of another frame*/
+void PicYuv::copyFromFrame(PicYuv* source)
+{
+    uint32_t numCuInHeight = (m_picHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;
+
+    int maxHeight = numCuInHeight * m_param->maxCUSize;
+    memcpy(m_picBuf[0], source->m_picBuf[0], sizeof(pixel)* m_stride * (maxHeight + (m_lumaMarginY * 2)));
+    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX;
+
+    if (m_picCsp != X265_CSP_I400)
+    {
+        memcpy(m_picBuf[1], source->m_picBuf[1], sizeof(pixel)* m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
+        memcpy(m_picBuf[2], source->m_picBuf[2], sizeof(pixel)* m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
+
+        m_picOrg[1] = m_picBuf[1] + m_chromaMarginY * m_strideC + m_chromaMarginX;
+        m_picOrg[2] = m_picBuf[2] + m_chromaMarginY * m_strideC + m_chromaMarginX;
+    }
+    else
+    {
+        m_picBuf[1] = m_picBuf[2] = NULL;
+        m_picOrg[1] = m_picOrg[2] = NULL;
+    }
+}
+
+bool PicYuv::createScaledPicYUV(x265_param* param, uint8_t scaleFactor)
+{
+    m_param = param;
+    m_picWidth = m_param->sourceWidth / scaleFactor;
+    m_picHeight = m_param->sourceHeight / scaleFactor;
+
+    m_picCsp = m_param->internalCsp;
+    m_hChromaShift = CHROMA_H_SHIFT(m_picCsp);
+    m_vChromaShift = CHROMA_V_SHIFT(m_picCsp);
+
+    uint32_t numCuInWidth = (m_picWidth + param->maxCUSize - 1) / param->maxCUSize;
+    uint32_t numCuInHeight = (m_picHeight + param->maxCUSize - 1) / param->maxCUSize;
+
+    m_lumaMarginX = 128; // search margin for L0 and L1 ME in horizontal direction
+    m_lumaMarginY = 128; // search margin for L0 and L1 ME in vertical direction
+    m_stride = (numCuInWidth * param->maxCUSize) + (m_lumaMarginX << 1);
+
+    int maxHeight = numCuInHeight * param->maxCUSize;
+    CHECKED_MALLOC_ZERO(m_picBuf[0], pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
+    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX;
+    m_picBuf[1] = m_picBuf[2] = NULL;
+    m_picOrg[1] = m_picOrg[2] = NULL;
+    return true;
+
+fail:
+    return false;
+}
+
 int PicYuv::getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp)
 {
     m_picWidth = picWidth;

 
@@ -125,6 +125,58 @@
     return false;
 }
 
+/*Copy pixels from the picture buffer of a frame to picture buffer of another frame*/
+void PicYuv::copyFromFrame(PicYuv* source)
+{
+    uint32_t numCuInHeight = (m_picHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;
+
+    int maxHeight = numCuInHeight * m_param->maxCUSize;
+    memcpy(m_picBuf[0], source->m_picBuf[0], sizeof(pixel)* m_stride * (maxHeight + (m_lumaMarginY * 2)));
+    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX;
+
+    if (m_picCsp != X265_CSP_I400)
+    {
+        memcpy(m_picBuf[1], source->m_picBuf[1], sizeof(pixel)* m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
+        memcpy(m_picBuf[2], source->m_picBuf[2], sizeof(pixel)* m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
+
+        m_picOrg[1] = m_picBuf[1] + m_chromaMarginY * m_strideC + m_chromaMarginX;
+        m_picOrg[2] = m_picBuf[2] + m_chromaMarginY * m_strideC + m_chromaMarginX;
+    }
+    else
+    {
+        m_picBuf[1] = m_picBuf[2] = NULL;
+        m_picOrg[1] = m_picOrg[2] = NULL;
+    }
+}
+
+bool PicYuv::createScaledPicYUV(x265_param* param, uint8_t scaleFactor)
+{
+    m_param = param;
+    m_picWidth = m_param->sourceWidth / scaleFactor;
+    m_picHeight = m_param->sourceHeight / scaleFactor;
+
+    m_picCsp = m_param->internalCsp;
+    m_hChromaShift = CHROMA_H_SHIFT(m_picCsp);
+    m_vChromaShift = CHROMA_V_SHIFT(m_picCsp);
+
+    uint32_t numCuInWidth = (m_picWidth + param->maxCUSize - 1) / param->maxCUSize;
+    uint32_t numCuInHeight = (m_picHeight + param->maxCUSize - 1) / param->maxCUSize;
+
+    m_lumaMarginX = 128; // search margin for L0 and L1 ME in horizontal direction
+    m_lumaMarginY = 128; // search margin for L0 and L1 ME in vertical direction
+    m_stride = (numCuInWidth * param->maxCUSize) + (m_lumaMarginX << 1);
+
+    int maxHeight = numCuInHeight * param->maxCUSize;
+    CHECKED_MALLOC_ZERO(m_picBuf[0], pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
+    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX;
+    m_picBuf[1] = m_picBuf[2] = NULL;
+    m_picOrg[1] = m_picOrg[2] = NULL;
+    return true;
+
+fail:
+    return false;
+}
+
 int PicYuv::getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp)
 {
     m_picWidth = picWidth;
​

x265_3.5.tar.gz/source/common/picyuv.h -> x265_3.6.tar.gz/source/common/picyuv.h Changed

 
@@ -78,11 +78,13 @@
     PicYuv();
 
     bool  create(x265_param* param, bool picAlloc = true, pixel *pixelbuf = NULL);
+    bool  createScaledPicYUV(x265_param* param, uint8_t scaleFactor);
     bool  createOffsets(const SPS& sps);
     void  destroy();
     int   getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp);
 
     void  copyFromPicture(const x265_picture&, const x265_param& param, int padx, int pady);
+    void  copyFromFrame(PicYuv* source);
 
     intptr_t getChromaAddrOffset(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
 
​

x265_3.5.tar.gz/source/common/pixel.cpp -> x265_3.6.tar.gz/source/common/pixel.cpp Changed

@@ -266,7 +266,7 @@
 {
     int satd = 0;
 
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
     pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
 #endif
 
@@ -284,7 +284,7 @@
 {
     int satd = 0;
 
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
     pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
 #endif
 
@@ -627,6 +627,23 @@
     }
 }
 
+static
+void frame_subsample_luma(const pixel* src0, pixel* dst0, intptr_t src_stride, intptr_t dst_stride, int width, int height)
+{
+    for (int y = 0; y < height; y++, src0 += 2 * src_stride, dst0 += dst_stride)
+    {
+        const pixel *inRow = src0;
+        const pixel *inRowBelow = src0 + src_stride;
+        pixel *target = dst0;
+        for (int x = 0; x < width; x++)
+        {
+            target[x] = (((inRow[0] + inRowBelow[0] + 1) >> 1) + ((inRow[1] + inRowBelow[1] + 1) >> 1) + 1) >> 1;
+            inRow += 2;
+            inRowBelow += 2;
+        }
+    }
+}
+
 /* structural similarity metric */
 static void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
 {
@@ -1355,5 +1372,7 @@
     p.cu[BLOCK_16x16].normFact = normFact_c;
     p.cu[BLOCK_32x32].normFact = normFact_c;
     p.cu[BLOCK_64x64].normFact = normFact_c;
+    /* SubSample Luma*/
+    p.frameSubSampleLuma = frame_subsample_luma;
 }
 }

 
@@ -266,7 +266,7 @@
 {
     int satd = 0;
 
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
     pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
 #endif
 
@@ -284,7 +284,7 @@
 {
     int satd = 0;
 
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
     pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
 #endif
 
@@ -627,6 +627,23 @@
     }
 }
 
+static
+void frame_subsample_luma(const pixel* src0, pixel* dst0, intptr_t src_stride, intptr_t dst_stride, int width, int height)
+{
+    for (int y = 0; y < height; y++, src0 += 2 * src_stride, dst0 += dst_stride)
+    {
+        const pixel *inRow = src0;
+        const pixel *inRowBelow = src0 + src_stride;
+        pixel *target = dst0;
+        for (int x = 0; x < width; x++)
+        {
+            target[x] = (((inRow[0] + inRowBelow[0] + 1) >> 1) + ((inRow[1] + inRowBelow[1] + 1) >> 1) + 1) >> 1;
+            inRow += 2;
+            inRowBelow += 2;
+        }
+    }
+}
+
 /* structural similarity metric */
 static void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
 {
@@ -1355,5 +1372,7 @@
     p.cu[BLOCK_16x16].normFact = normFact_c;
     p.cu[BLOCK_32x32].normFact = normFact_c;
     p.cu[BLOCK_64x64].normFact = normFact_c;
+    /* SubSample Luma*/
+    p.frameSubSampleLuma = frame_subsample_luma;
 }
 }
​

x265_3.5.tar.gz/source/common/ppc/intrapred_altivec.cpp -> x265_3.6.tar.gz/source/common/ppc/intrapred_altivec.cpp Changed

 
@@ -27,7 +27,7 @@
 #include <assert.h>
 #include <math.h>
 #include <cmath>
-#include <linux/types.h>
+#include <sys/types.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <stdint.h>
​

x265_3.5.tar.gz/source/common/primitives.h -> x265_3.6.tar.gz/source/common/primitives.h Changed

@@ -232,6 +232,8 @@
 typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
 typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, const pixel *recon,  intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k);
 typedef void(*normFactor_t)(const pixel *src, uint32_t blockSize, int shift, uint64_t *z_k);
+/* SubSampling Luma */
+typedef void (*downscaleluma_t)(const pixel* src0, pixel* dstf, intptr_t src_stride, intptr_t dst_stride, int width, int height);
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
 struct EncoderPrimitives
@@ -353,6 +355,8 @@
 
     downscale_t           frameInitLowres;
     downscale_t           frameInitLowerRes;
+    /* Sub Sample Luma */
+    downscaleluma_t        frameSubSampleLuma;
     cutree_propagate_cost propagateCost;
     cutree_fix8_unpack    fix8Unpack;
     cutree_fix8_pack      fix8Pack;
@@ -488,7 +492,7 @@
 
 #if ENABLE_ASSEMBLY && X265_ARCH_ARM64
 extern "C" {
-#include "aarch64/pixel-util.h"
+#include "aarch64/fun-decls.h"
 }
 #endif

 
@@ -232,6 +232,8 @@
 typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
 typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, const pixel *recon,  intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k);
 typedef void(*normFactor_t)(const pixel *src, uint32_t blockSize, int shift, uint64_t *z_k);
+/* SubSampling Luma */
+typedef void (*downscaleluma_t)(const pixel* src0, pixel* dstf, intptr_t src_stride, intptr_t dst_stride, int width, int height);
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
 struct EncoderPrimitives
@@ -353,6 +355,8 @@
 
     downscale_t           frameInitLowres;
     downscale_t           frameInitLowerRes;
+    /* Sub Sample Luma */
+    downscaleluma_t        frameSubSampleLuma;
     cutree_propagate_cost propagateCost;
     cutree_fix8_unpack    fix8Unpack;
     cutree_fix8_pack      fix8Pack;
@@ -488,7 +492,7 @@
 
 #if ENABLE_ASSEMBLY && X265_ARCH_ARM64
 extern "C" {
-#include "aarch64/pixel-util.h"
+#include "aarch64/fun-decls.h"
 }
 #endif
 
​

x265_3.6.tar.gz/source/common/ringmem.cpp Added

@@ -0,0 +1,357 @@
+/*****************************************************************************
+ * Copyright (C) 2013-2017 MulticoreWare, Inc
+ *
+ * Authors: liwei <liwei@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com
+ *****************************************************************************/
+
+#include "ringmem.h"
+
+#ifndef _WIN32
+#include <sys/mman.h>
+#endif ////< _WIN32
+
+#ifdef _WIN32
+#define X265_SHARED_MEM_NAME                    "Local\\_x265_shr_mem_"
+#define X265_SEMAPHORE_RINGMEM_WRITER_NAME	    "_x265_semW_"
+#define X265_SEMAPHORE_RINGMEM_READER_NAME	    "_x265_semR_"
+#else /* POSIX / pthreads */
+#define X265_SHARED_MEM_NAME                    "/tmp/_x265_shr_mem_"
+#define X265_SEMAPHORE_RINGMEM_WRITER_NAME	    "/tmp/_x265_semW_"
+#define X265_SEMAPHORE_RINGMEM_READER_NAME	    "/tmp/_x265_semR_"
+#endif
+
+#define RINGMEM_ALLIGNMENT                       64
+
+namespace X265_NS {
+    RingMem::RingMem() 
+        : m_initialized(false)
+        , m_protectRW(false)
+        , m_itemSize(0)
+        , m_itemCnt(0)
+        , m_dataPool(NULL)
+        , m_shrMem(NULL)
+#ifdef _WIN32
+        , m_handle(NULL)
+#else //_WIN32
+        , m_filepath(NULL)
+#endif //_WIN32
+        , m_writeSem(NULL)
+        , m_readSem(NULL)
+    {
+    }
+
+
+    RingMem::~RingMem()
+    {
+    }
+
+    bool RingMem::skipRead(int32_t cnt) {
+        if (!m_initialized)
+        {
+            return false;
+        }
+
+        if (m_protectRW)
+        {
+            for (int i = 0; i < cnt; i++)
+            {
+                m_readSem->take();
+            }
+        }
+        
+        ATOMIC_ADD(&m_shrMem->m_read, cnt);
+
+        if (m_protectRW)
+        {
+            m_writeSem->give(cnt);
+        }
+
+        return true;
+    }
+
+    bool RingMem::skipWrite(int32_t cnt) {
+        if (!m_initialized)
+        {
+            return false;
+        }
+
+        if (m_protectRW)
+        {
+            for (int i = 0; i < cnt; i++)
+            {
+                m_writeSem->take();
+            }
+        }
+
+        ATOMIC_ADD(&m_shrMem->m_write, cnt);
+
+        if (m_protectRW)
+        {
+            m_readSem->give(cnt);
+        }
+
+        return true;
+    }
+
+    ///< initialize
+    bool RingMem::init(int32_t itemSize, int32_t itemCnt, const char *name, bool protectRW)
+    {
+        ///< check parameters
+        if (itemSize <= 0 || itemCnt <= 0 || NULL == name)
+        {
+            ///< invalid parameters 
+            return false;
+        }
+
+        if (!m_initialized)
+        {
+            ///< formatting names
+            char nameBuf[MAX_SHR_NAME_LEN] = { 0 };
+
+            ///< shared memory name
+            snprintf(nameBuf, sizeof(nameBuf) - 1, "%s%s", X265_SHARED_MEM_NAME, name);
+
+            ///< create or open shared memory
+            bool newCreated = false;
+
+            ///< calculate the size of the shared memory
+            int32_t shrMemSize = (itemSize * itemCnt + sizeof(ShrMemCtrl) + RINGMEM_ALLIGNMENT - 1) & ~(RINGMEM_ALLIGNMENT - 1);
+
+#ifdef _WIN32
+            HANDLE h = OpenFileMappingA(FILE_MAP_WRITE | FILE_MAP_READ, FALSE, nameBuf);
+            if (!h)
+            {
+                h = CreateFileMappingA(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, shrMemSize, nameBuf);
+
+                if (!h)
+                {
+                    return false;
+                }
+
+                newCreated = true;
+            }
+
+            void *pool = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0, 0);
+
+            ///< should not close the handle here, otherwise the OpenFileMapping would fail
+            //CloseHandle(h);
+            m_handle = h;
+
+            if (!pool)
+            {
+                return false;
+            }
+
+#else /* POSIX / pthreads */
+            mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
+            int flag = O_RDWR;
+            int shrfd = -1;
+            if ((shrfd = open(nameBuf, flag, mode)) < 0)
+            {
+                flag |= O_CREAT;
+                
+                shrfd = open(nameBuf, flag, mode);
+                if (shrfd < 0)
+                {
+                    return false;
+                }
+                newCreated = true;
+
+                lseek(shrfd, shrMemSize - 1, SEEK_SET);
+
+                if (-1 == write(shrfd, "\0", 1))
+                {
+                    close(shrfd);
+                    return false;
+                }
+
+                if (lseek(shrfd, 0, SEEK_END) < shrMemSize)
+                {
+                    close(shrfd);
+                    return false;
+                }
+            }
+
+            void *pool = mmap(0,
+                shrMemSize,
+                PROT_READ | PROT_WRITE,
+                MAP_SHARED,
+                shrfd,
+                0);
+
+            close(shrfd);

 
@@ -0,0 +1,357 @@
+/*****************************************************************************
+ * Copyright (C) 2013-2017 MulticoreWare, Inc
+ *
+ * Authors: liwei <liwei@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com
+ *****************************************************************************/
+
+#include "ringmem.h"
+
+#ifndef _WIN32
+#include <sys/mman.h>
+#endif ////< _WIN32
+
+#ifdef _WIN32
+#define X265_SHARED_MEM_NAME                    "Local\\_x265_shr_mem_"
+#define X265_SEMAPHORE_RINGMEM_WRITER_NAME     "_x265_semW_"
+#define X265_SEMAPHORE_RINGMEM_READER_NAME     "_x265_semR_"
+#else /* POSIX / pthreads */
+#define X265_SHARED_MEM_NAME                    "/tmp/_x265_shr_mem_"
+#define X265_SEMAPHORE_RINGMEM_WRITER_NAME     "/tmp/_x265_semW_"
+#define X265_SEMAPHORE_RINGMEM_READER_NAME     "/tmp/_x265_semR_"
+#endif
+
+#define RINGMEM_ALLIGNMENT                       64
+
+namespace X265_NS {
+    RingMem::RingMem() 
+        : m_initialized(false)
+        , m_protectRW(false)
+        , m_itemSize(0)
+        , m_itemCnt(0)
+        , m_dataPool(NULL)
+        , m_shrMem(NULL)
+#ifdef _WIN32
+        , m_handle(NULL)
+#else //_WIN32
+        , m_filepath(NULL)
+#endif //_WIN32
+        , m_writeSem(NULL)
+        , m_readSem(NULL)
+    {
+    }
+
+
+    RingMem::~RingMem()
+    {
+    }
+
+    bool RingMem::skipRead(int32_t cnt) {
+        if (!m_initialized)
+        {
+            return false;
+        }
+
+        if (m_protectRW)
+        {
+            for (int i = 0; i < cnt; i++)
+            {
+                m_readSem->take();
+            }
+        }
+        
+        ATOMIC_ADD(&m_shrMem->m_read, cnt);
+
+        if (m_protectRW)
+        {
+            m_writeSem->give(cnt);
+        }
+
+        return true;
+    }
+
+    bool RingMem::skipWrite(int32_t cnt) {
+        if (!m_initialized)
+        {
+            return false;
+        }
+
+        if (m_protectRW)
+        {
+            for (int i = 0; i < cnt; i++)
+            {
+                m_writeSem->take();
+            }
+        }
+
+        ATOMIC_ADD(&m_shrMem->m_write, cnt);
+
+        if (m_protectRW)
+        {
+            m_readSem->give(cnt);
+        }
+
+        return true;
+    }
+
+    ///< initialize
+    bool RingMem::init(int32_t itemSize, int32_t itemCnt, const char *name, bool protectRW)
+    {
+        ///< check parameters
+        if (itemSize <= 0 || itemCnt <= 0 || NULL == name)
+        {
+            ///< invalid parameters 
+            return false;
+        }
+
+        if (!m_initialized)
+        {
+            ///< formatting names
+            char nameBuf[MAX_SHR_NAME_LEN] = { 0 };
+
+            ///< shared memory name
+            snprintf(nameBuf, sizeof(nameBuf) - 1, "%s%s", X265_SHARED_MEM_NAME, name);
+
+            ///< create or open shared memory
+            bool newCreated = false;
+
+            ///< calculate the size of the shared memory
+            int32_t shrMemSize = (itemSize * itemCnt + sizeof(ShrMemCtrl) + RINGMEM_ALLIGNMENT - 1) & ~(RINGMEM_ALLIGNMENT - 1);
+
+#ifdef _WIN32
+            HANDLE h = OpenFileMappingA(FILE_MAP_WRITE | FILE_MAP_READ, FALSE, nameBuf);
+            if (!h)
+            {
+                h = CreateFileMappingA(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, shrMemSize, nameBuf);
+
+                if (!h)
+                {
+                    return false;
+                }
+
+                newCreated = true;
+            }
+
+            void *pool = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0, 0);
+
+            ///< should not close the handle here, otherwise the OpenFileMapping would fail
+            //CloseHandle(h);
+            m_handle = h;
+
+            if (!pool)
+            {
+                return false;
+            }
+
+#else /* POSIX / pthreads */
+            mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
+            int flag = O_RDWR;
+            int shrfd = -1;
+            if ((shrfd = open(nameBuf, flag, mode)) < 0)
+            {
+                flag |= O_CREAT;
+                
+                shrfd = open(nameBuf, flag, mode);
+                if (shrfd < 0)
+                {
+                    return false;
+                }
+                newCreated = true;
+
+                lseek(shrfd, shrMemSize - 1, SEEK_SET);
+
+                if (-1 == write(shrfd, "\0", 1))
+                {
+                    close(shrfd);
+                    return false;
+                }
+
+                if (lseek(shrfd, 0, SEEK_END) < shrMemSize)
+                {
+                    close(shrfd);
+                    return false;
+                }
+            }
+
+            void *pool = mmap(0,
+                shrMemSize,
+                PROT_READ | PROT_WRITE,
+                MAP_SHARED,
+                shrfd,
+                0);
+
+            close(shrfd);
​

x265_3.6.tar.gz/source/common/ringmem.h Added

@@ -0,0 +1,90 @@
+/*****************************************************************************
+ * Copyright (C) 2013-2017 MulticoreWare, Inc
+ *
+ * Authors: liwei <liwei@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com
+ *****************************************************************************/
+
+#ifndef X265_RINGMEM_H
+#define X265_RINGMEM_H
+
+#include "common.h"
+#include "threading.h"
+
+// MSVC compatibility shims.
+// strdup:   MSVC spells the POSIX function _strdup.
+// snprintf: only toolchains older than Visual Studio 2015 (_MSC_VER < 1900)
+//           lack a conforming snprintf. _snprintf does not guarantee NUL
+//           termination, so the alias is not applied on newer toolchains.
+#if defined(_MSC_VER)
+#if _MSC_VER < 1900
+#define snprintf _snprintf
+#endif
+#define strdup _strdup
+#endif
+
+namespace X265_NS {
+
+#define MAX_SHR_NAME_LEN                         256
+
+    // Ring buffer of fixed-size items backed by a named shared-memory region
+    // (a Win32 file mapping, or an mmap()'d file on POSIX builds).
+    // Only the interface is declared here; behaviour notes marked "presumably"
+    // are inferred from names and should be confirmed in the implementation.
+    class RingMem {
+    public:
+        RingMem();
+        ~RingMem();
+
+        ///< presumably advances the read index by cnt items without copying - confirm
+        bool skipRead(int32_t cnt);
+
+        ///< presumably advances the write index by cnt items without copying - confirm
+        bool skipWrite(int32_t cnt);
+
+        ///< initialize
+        ///< itemSize / itemCnt: size of one item and number of items in the ring.
+        ///< name: name used to identify the shared region.
+        ///< protectRW: whether semaphores are used to protect the write and read operations.
+        ///< returns false on failure.
+        bool init(int32_t itemSize, int32_t itemCnt, const char *name, bool protectRW = false);
+        ///< finalize
+        void release();
+
+        ///< copy callback supplied by the caller: receives destination, source and a size
+        typedef void(*fnRWSharedData)(void *dst, void *src, int32_t size);
+
+        ///< data read: the next item is delivered into dst through callback
+        bool readNext(void* dst, fnRWSharedData callback);
+        ///< data write: data is stored as the next item through callback
+        bool writeData(void *data, fnRWSharedData callback);
+
+    private:        
+        bool    m_initialized;
+        bool    m_protectRW;
+
+        int32_t m_itemSize;
+        int32_t m_itemCnt;
+        ///< data pool
+        void   *m_dataPool;
+        ///< read/write indices; an instance is reached through m_shrMem
+        typedef struct {
+            ///< index to write
+            int32_t m_write;
+            ///< index to read
+            int32_t m_read;
+            
+        }ShrMemCtrl;
+
+        ShrMemCtrl *m_shrMem;
+#ifdef _WIN32
+        ///< Windows only: handle of the file mapping
+        void       *m_handle;
+#else // _WIN32
+        ///< non-Windows only: presumably the path of the backing file - confirm
+        char       *m_filepath;
+#endif // _WIN32
+
+        ///< Semaphores
+        NamedSemaphore *m_writeSem;
+        NamedSemaphore *m_readSem;
+    };
+};
+
+#endif // ifndef X265_RINGMEM_H

 
@@ -0,0 +1,90 @@
+/*****************************************************************************
+ * Copyright (C) 2013-2017 MulticoreWare, Inc
+ *
+ * Authors: liwei <liwei@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com
+ *****************************************************************************/
+
+#ifndef X265_RINGMEM_H
+#define X265_RINGMEM_H
+
+#include "common.h"
+#include "threading.h"
+
+// MSVC compatibility shims.
+// strdup:   MSVC spells the POSIX function _strdup.
+// snprintf: only toolchains older than Visual Studio 2015 (_MSC_VER < 1900)
+//           lack a conforming snprintf. _snprintf does not guarantee NUL
+//           termination, so the alias is not applied on newer toolchains.
+#if defined(_MSC_VER)
+#if _MSC_VER < 1900
+#define snprintf _snprintf
+#endif
+#define strdup _strdup
+#endif
+
+namespace X265_NS {
+
+#define MAX_SHR_NAME_LEN                         256
+
+    // Ring buffer of fixed-size items backed by a named shared-memory region
+    // (a Win32 file mapping, or an mmap()'d file on POSIX builds).
+    // Only the interface is declared here; behaviour notes marked "presumably"
+    // are inferred from names and should be confirmed in the implementation.
+    class RingMem {
+    public:
+        RingMem();
+        ~RingMem();
+
+        ///< presumably advances the read index by cnt items without copying - confirm
+        bool skipRead(int32_t cnt);
+
+        ///< presumably advances the write index by cnt items without copying - confirm
+        bool skipWrite(int32_t cnt);
+
+        ///< initialize
+        ///< itemSize / itemCnt: size of one item and number of items in the ring.
+        ///< name: name used to identify the shared region.
+        ///< protectRW: whether semaphores are used to protect the write and read operations.
+        ///< returns false on failure.
+        bool init(int32_t itemSize, int32_t itemCnt, const char *name, bool protectRW = false);
+        ///< finalize
+        void release();
+
+        ///< copy callback supplied by the caller: receives destination, source and a size
+        typedef void(*fnRWSharedData)(void *dst, void *src, int32_t size);
+
+        ///< data read: the next item is delivered into dst through callback
+        bool readNext(void* dst, fnRWSharedData callback);
+        ///< data write: data is stored as the next item through callback
+        bool writeData(void *data, fnRWSharedData callback);
+
+    private:        
+        bool    m_initialized;
+        bool    m_protectRW;
+
+        int32_t m_itemSize;
+        int32_t m_itemCnt;
+        ///< data pool
+        void   *m_dataPool;
+        ///< read/write indices; an instance is reached through m_shrMem
+        typedef struct {
+            ///< index to write
+            int32_t m_write;
+            ///< index to read
+            int32_t m_read;
+            
+        }ShrMemCtrl;
+
+        ShrMemCtrl *m_shrMem;
+#ifdef _WIN32
+        ///< Windows only: handle of the file mapping
+        void       *m_handle;
+#else // _WIN32
+        ///< non-Windows only: presumably the path of the backing file - confirm
+        char       *m_filepath;
+#endif // _WIN32
+
+        ///< Semaphores
+        NamedSemaphore *m_writeSem;
+        NamedSemaphore *m_readSem;
+    };
+};
+
+#endif // ifndef X265_RINGMEM_H
​

x265_3.5.tar.gz/source/common/slice.h -> x265_3.6.tar.gz/source/common/slice.h Changed

@@ -156,9 +156,9 @@
     HRDInfo          hrdParameters;
     ProfileTierLevel ptl;
     uint32_t         maxTempSubLayers;
-    uint32_t         numReorderPics;
-    uint32_t         maxDecPicBuffering;
-    uint32_t         maxLatencyIncrease;
+    uint32_t         numReorderPicsMAX_T_LAYERS;
+    uint32_t         maxDecPicBufferingMAX_T_LAYERS;
+    uint32_t         maxLatencyIncreaseMAX_T_LAYERS;
 };
 
 struct Window
@@ -235,9 +235,9 @@
     uint32_t maxAMPDepth;
 
     uint32_t maxTempSubLayers;   // max number of Temporal Sub layers
-    uint32_t maxDecPicBuffering; // these are dups of VPS values
-    uint32_t maxLatencyIncrease;
-    int      numReorderPics;
+    uint32_t maxDecPicBufferingMAX_T_LAYERS; // these are dups of VPS values
+    uint32_t maxLatencyIncreaseMAX_T_LAYERS;
+    int      numReorderPicsMAX_T_LAYERS;
 
     RPS      spsrpsMAX_NUM_SHORT_TERM_RPS;
     int      spsrpsNum;
@@ -363,6 +363,7 @@
     int         m_iNumRPSInSPS;
     const x265_param *m_param;
     int         m_fieldNum;
+    Frame*      m_mcstfRefFrameList2MAX_MCSTF_TEMPORAL_WINDOW_LENGTH;
 
     Slice()
     {

 
@@ -156,9 +156,9 @@
     HRDInfo          hrdParameters;
     ProfileTierLevel ptl;
     uint32_t         maxTempSubLayers;
-    uint32_t         numReorderPics;
-    uint32_t         maxDecPicBuffering;
-    uint32_t         maxLatencyIncrease;
+    uint32_t         numReorderPicsMAX_T_LAYERS;
+    uint32_t         maxDecPicBufferingMAX_T_LAYERS;
+    uint32_t         maxLatencyIncreaseMAX_T_LAYERS;
 };
 
 struct Window
@@ -235,9 +235,9 @@
     uint32_t maxAMPDepth;
 
     uint32_t maxTempSubLayers;   // max number of Temporal Sub layers
-    uint32_t maxDecPicBuffering; // these are dups of VPS values
-    uint32_t maxLatencyIncrease;
-    int      numReorderPics;
+    uint32_t maxDecPicBufferingMAX_T_LAYERS; // these are dups of VPS values
+    uint32_t maxLatencyIncreaseMAX_T_LAYERS;
+    int      numReorderPicsMAX_T_LAYERS;
 
     RPS      spsrpsMAX_NUM_SHORT_TERM_RPS;
     int      spsrpsNum;
@@ -363,6 +363,7 @@
     int         m_iNumRPSInSPS;
     const x265_param *m_param;
     int         m_fieldNum;
+    Frame*      m_mcstfRefFrameList2MAX_MCSTF_TEMPORAL_WINDOW_LENGTH;
 
     Slice()
     {
​

x265_3.6.tar.gz/source/common/temporalfilter.cpp Added

@@ -0,0 +1,1017 @@
+/*****************************************************************************
+* Copyright (C) 2013-2021 MulticoreWare, Inc
+*
+ * Authors: Ashok Kumar Mishra <ashok@multicorewareinc.com>
+ *
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+#include "common.h"
+#include "temporalfilter.h"
+#include "primitives.h"
+
+#include "frame.h"
+#include "slice.h"
+#include "framedata.h"
+#include "analysis.h"
+
+using namespace X265_NS;
+
+// Hands the frame to m_mcstfPicList.pushFrontMCSTF(). inFrame must not be NULL.
+void OrigPicBuffer::addPicture(Frame* inFrame)
+{
+    Frame& frame = *inFrame;
+    m_mcstfPicList.pushFrontMCSTF(frame);
+}
+
+// Hands the frame to m_mcstfOrigPicFreeList.pushFrontMCSTF(), i.e. it goes on
+// the free list. inFrame must not be NULL.
+void OrigPicBuffer::addEncPicture(Frame* inFrame)
+{
+    Frame& frame = *inFrame;
+    m_mcstfOrigPicFreeList.pushFrontMCSTF(frame);
+}
+
+// Hands the frame to m_mcstfOrigPicList.pushFrontMCSTF(). inFrame must not be NULL.
+void OrigPicBuffer::addEncPictureToPicList(Frame* inFrame)
+{
+    Frame& frame = *inFrame;
+    m_mcstfOrigPicList.pushFrontMCSTF(frame);
+}
+
+// Drains m_mcstfOrigPicList and then m_mcstfOrigPicFreeList; every frame popped
+// from them is destroy()ed and deleted. m_mcstfPicList is not touched here.
+OrigPicBuffer::~OrigPicBuffer()
+{
+    PicList* const drained[2] = { &m_mcstfOrigPicList, &m_mcstfOrigPicFreeList };
+
+    for (int i = 0; i < 2; i++)
+    {
+        PicList& list = *drained[i];
+        while (!list.empty())
+        {
+            Frame* frame = list.popBackMCSTF();
+            frame->destroy();
+            delete frame;
+        }
+    }
+}
+
+// Fills the MCSTF reference slots of inFrame's slice.
+// Visits every POC in [m_poc - m_range, m_poc + m_range] except inFrame's own,
+// skipping negative POCs and stopping at the first POC >= frameCnt. For each
+// visited POC the frame is looked up in m_mcstfPicList and in
+// m_mcstfOrigPicList; a hit is stored in slot j and its counter is decremented.
+// NOTE(review): `m_mcstfRefFrameList1j` and `m_refPicCnt1` look like array
+// accesses whose subscript brackets were lost in this rendering - confirm
+// against the original file.
+void OrigPicBuffer::setOrigPicList(Frame* inFrame, int frameCnt)
+{
+    Slice* slice = inFrame->m_encData->m_slice;
+    // slot index; advanced once per visited POC, whether or not a frame was found
+    uint8_t j = 0;
+    for (int iterPOC = (inFrame->m_poc - inFrame->m_mcstf->m_range);
+        iterPOC <= (inFrame->m_poc + inFrame->m_mcstf->m_range); iterPOC++)
+    {
+        if (iterPOC != inFrame->m_poc)
+        {
+            // before the first frame: skipped without consuming a slot
+            if (iterPOC < 0)
+                continue;
+            // POCs only increase, so nothing further can be in range
+            if (iterPOC >= frameCnt)
+                break;
+
+            Frame *iterFrame = m_mcstfPicList.getPOCMCSTF(iterPOC);
+            X265_CHECK(iterFrame, "Reference frame not found in OPB");
+            if (iterFrame != NULL)
+            {
+                slice->m_mcstfRefFrameList1j = iterFrame;
+                iterFrame->m_refPicCnt1--;
+            }
+
+            // a hit in m_mcstfOrigPicList overwrites the slot written just above
+            iterFrame = m_mcstfOrigPicList.getPOCMCSTF(iterPOC);
+            if (iterFrame != NULL)
+            {
+
+                slice->m_mcstfRefFrameList1j = iterFrame;
+
+                iterFrame->m_refPicCnt1--;
+                // the entry for inFrame's own POC gets its counter decremented too
+                Frame *cFrame = m_mcstfOrigPicList.getPOCMCSTF(inFrame->m_poc);
+                X265_CHECK(cFrame, "Reference frame not found in encoded OPB");
+                // NOTE(review): cFrame is dereferenced unconditionally; if X265_CHECK
+                // only reports (or is compiled out) a missing frame is a NULL
+                // dereference here - confirm.
+                cFrame->m_refPicCnt1--;
+            }
+            j++;
+        }
+    }
+}
+
+// Removes every frame whose m_refPicCnt1 counter is zero from m_mcstfPicList
+// and from m_mcstfOrigPicList. Frames removed from m_mcstfOrigPicList are
+// pushed onto m_mcstfOrigPicFreeList; those removed from m_mcstfPicList are
+// only unlinked.
+// NOTE(review): `m_refPicCnt1` looks like an array access whose subscript
+// brackets were lost in this rendering - confirm against the original file.
+void OrigPicBuffer::recycleOrigPicList()
+{
+    Frame *iterFrame = m_mcstfPicList.first();
+
+    while (iterFrame)
+    {
+        Frame *curFrame = iterFrame;
+        // successor is read before curFrame may be unlinked
+        iterFrame = iterFrame->m_nextMCSTF;
+        if (!curFrame->m_refPicCnt1)
+        {
+            m_mcstfPicList.removeMCSTF(*curFrame);
+            // the list changed, so the scan restarts from the head
+            iterFrame = m_mcstfPicList.first();
+        }
+    }
+
+    iterFrame = m_mcstfOrigPicList.first();
+
+    while (iterFrame)
+    {
+        Frame *curFrame = iterFrame;
+        // successor is read before curFrame may be unlinked
+        iterFrame = iterFrame->m_nextMCSTF;
+        if (!curFrame->m_refPicCnt1)
+        {
+            m_mcstfOrigPicList.removeMCSTF(*curFrame);
+            // clear the flag m_isSubSampled points at before the frame is reused
+            *curFrame->m_isSubSampled = false;
+            m_mcstfOrigPicFreeList.pushFrontMCSTF(*curFrame);
+            // the list changed, so the scan restarts from the head
+            iterFrame = m_mcstfOrigPicList.first();
+        }
+    }
+}
+
+// Hands the frame to m_mcstfOrigPicFreeList.pushBack(). inFrame must not be NULL.
+// NOTE(review): every other access to this list in this file goes through the
+// *MCSTF list methods; confirm that plain pushBack() is intended here.
+void OrigPicBuffer::addPictureToFreelist(Frame* inFrame)
+{
+    Frame& frame = *inFrame;
+    m_mcstfOrigPicFreeList.pushBack(frame);
+}
+
+// Gives every data member a defined starting value. The stream-dependent
+// members (m_param, m_bitDepth, source size, colour space, component count,
+// m_metld) are overwritten by init().
+TemporalFilter::TemporalFilter()
+{
+    // Previously left indeterminate until init(); now start from a known state.
+    m_param = NULL;
+    m_metld = NULL;
+    m_bitDepth = 0;
+    m_padding = 0;
+    m_internalCsp = 0;
+    m_numComponents = 0;
+
+    m_sourceWidth = 0;
+    m_sourceHeight = 0;   // was terminated by ',' (comma operator) instead of ';'
+    m_QP = 0;
+    m_sliceTypeConfig = 3;
+    m_numRef = 0;
+    m_useSADinME = 1;
+
+    // Filter tuning defaults
+    m_range = 2;
+    m_chromaFactor = 0.55;
+    m_sigmaMultiplier = 9.0;
+    m_sigmaZeroPoint = 10.0;
+    m_motionVectorFactor = 16;
+}
+
+// Copies the stream properties out of param, allocates the motion estimator
+// and creates the luma-only prediction buffer.
+// param is stored, not copied, so it must outlive this object.
+void TemporalFilter::init(const x265_param* param)
+{
+    m_param        = param;
+    m_sourceWidth  = param->sourceWidth;
+    m_sourceHeight = param->sourceHeight;
+    m_bitDepth     = param->internalBitDepth;
+    m_internalCsp  = param->internalCsp;
+
+    // A 4:0:0 source has a single component; every other format has all of them.
+    if (m_internalCsp == X265_CSP_I400)
+        m_numComponents = 1;
+    else
+        m_numComponents = MAX_NUM_COMPONENT;
+
+    m_metld = new MotionEstimatorTLD;
+
+    predPUYuv.create(FENC_STRIDE, X265_CSP_I400);
+}
+
+// Allocates the per-reference buffers of refFrame: one motion-vector plane at
+// 1/4 resolution, three at 1/16 resolution, the noise and error planes at 1/4
+// resolution, and the compensated picture.
+// Returns 1 on success and 0 when control reaches the `fail` label.
+// CHECKED_MALLOC_ZERO is defined elsewhere and is expected to jump to `fail`
+// when an allocation fails.
+// NOTE(review): the count arguments already include a sizeof() factor; if the
+// macro also scales by the element type these buffers are over-allocated - confirm.
+int TemporalFilter::createRefPicInfo(TemporalFilterRefPicInfo* refFrame, x265_param* param)
+{
+    // Declared ahead of the first allocation so that no `goto fail` crosses an initialisation.
+    const int    stride4   = m_sourceWidth / 4;
+    const int    stride16  = m_sourceWidth / 16;
+    const size_t mvCnt4    = sizeof(MV) * stride4 * (m_sourceHeight / 4);
+    const size_t mvCnt16   = sizeof(MV) * stride16 * (m_sourceHeight / 16);
+    const size_t intCnt4   = sizeof(int) * stride4 * (m_sourceHeight / 4);
+
+    CHECKED_MALLOC_ZERO(refFrame->mvs, MV, mvCnt4);
+    refFrame->mvsStride = stride4;
+
+    CHECKED_MALLOC_ZERO(refFrame->mvs0, MV, mvCnt16);
+    refFrame->mvsStride0 = stride16;
+
+    CHECKED_MALLOC_ZERO(refFrame->mvs1, MV, mvCnt16);
+    refFrame->mvsStride1 = stride16;
+
+    CHECKED_MALLOC_ZERO(refFrame->mvs2, MV, mvCnt16);
+    refFrame->mvsStride2 = stride16;
+
+    CHECKED_MALLOC_ZERO(refFrame->noise, int, intCnt4);
+    CHECKED_MALLOC_ZERO(refFrame->error, int, intCnt4);
+
+    refFrame->slicetype = X265_TYPE_AUTO;
+
+    refFrame->compensatedPic = new PicYuv;
+    refFrame->compensatedPic->create(param, true);
+
+    return 1;
+
+fail:
+    return 0;
+}
+
+int TemporalFilter::motionErrorLumaSAD(
+    PicYuv *orig,
+    PicYuv *buffer,
+    int x,
+    int y,
+    int dx,

 
@@ -0,0 +1,1017 @@
+/*****************************************************************************
+* Copyright (C) 2013-2021 MulticoreWare, Inc
+*
+ * Authors: Ashok Kumar Mishra <ashok@multicorewareinc.com>
+ *
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+#include "common.h"
+#include "temporalfilter.h"
+#include "primitives.h"
+
+#include "frame.h"
+#include "slice.h"
+#include "framedata.h"
+#include "analysis.h"
+
+using namespace X265_NS;
+
+// Hands the frame to m_mcstfPicList.pushFrontMCSTF(). inFrame must not be NULL.
+void OrigPicBuffer::addPicture(Frame* inFrame)
+{
+    Frame& frame = *inFrame;
+    m_mcstfPicList.pushFrontMCSTF(frame);
+}
+
+// Hands the frame to m_mcstfOrigPicFreeList.pushFrontMCSTF(), i.e. it goes on
+// the free list. inFrame must not be NULL.
+void OrigPicBuffer::addEncPicture(Frame* inFrame)
+{
+    Frame& frame = *inFrame;
+    m_mcstfOrigPicFreeList.pushFrontMCSTF(frame);
+}
+
+// Hands the frame to m_mcstfOrigPicList.pushFrontMCSTF(). inFrame must not be NULL.
+void OrigPicBuffer::addEncPictureToPicList(Frame* inFrame)
+{
+    Frame& frame = *inFrame;
+    m_mcstfOrigPicList.pushFrontMCSTF(frame);
+}
+
+// Drains m_mcstfOrigPicList and then m_mcstfOrigPicFreeList; every frame popped
+// from them is destroy()ed and deleted. m_mcstfPicList is not touched here.
+OrigPicBuffer::~OrigPicBuffer()
+{
+    PicList* const drained[2] = { &m_mcstfOrigPicList, &m_mcstfOrigPicFreeList };
+
+    for (int i = 0; i < 2; i++)
+    {
+        PicList& list = *drained[i];
+        while (!list.empty())
+        {
+            Frame* frame = list.popBackMCSTF();
+            frame->destroy();
+            delete frame;
+        }
+    }
+}
+
+// Fills the MCSTF reference slots of inFrame's slice.
+// Visits every POC in [m_poc - m_range, m_poc + m_range] except inFrame's own,
+// skipping negative POCs and stopping at the first POC >= frameCnt. For each
+// visited POC the frame is looked up in m_mcstfPicList and in
+// m_mcstfOrigPicList; a hit is stored in slot j and its counter is decremented.
+// NOTE(review): `m_mcstfRefFrameList1j` and `m_refPicCnt1` look like array
+// accesses whose subscript brackets were lost in this rendering - confirm
+// against the original file.
+void OrigPicBuffer::setOrigPicList(Frame* inFrame, int frameCnt)
+{
+    Slice* slice = inFrame->m_encData->m_slice;
+    // slot index; advanced once per visited POC, whether or not a frame was found
+    uint8_t j = 0;
+    for (int iterPOC = (inFrame->m_poc - inFrame->m_mcstf->m_range);
+        iterPOC <= (inFrame->m_poc + inFrame->m_mcstf->m_range); iterPOC++)
+    {
+        if (iterPOC != inFrame->m_poc)
+        {
+            // before the first frame: skipped without consuming a slot
+            if (iterPOC < 0)
+                continue;
+            // POCs only increase, so nothing further can be in range
+            if (iterPOC >= frameCnt)
+                break;
+
+            Frame *iterFrame = m_mcstfPicList.getPOCMCSTF(iterPOC);
+            X265_CHECK(iterFrame, "Reference frame not found in OPB");
+            if (iterFrame != NULL)
+            {
+                slice->m_mcstfRefFrameList1j = iterFrame;
+                iterFrame->m_refPicCnt1--;
+            }
+
+            // a hit in m_mcstfOrigPicList overwrites the slot written just above
+            iterFrame = m_mcstfOrigPicList.getPOCMCSTF(iterPOC);
+            if (iterFrame != NULL)
+            {
+
+                slice->m_mcstfRefFrameList1j = iterFrame;
+
+                iterFrame->m_refPicCnt1--;
+                // the entry for inFrame's own POC gets its counter decremented too
+                Frame *cFrame = m_mcstfOrigPicList.getPOCMCSTF(inFrame->m_poc);
+                X265_CHECK(cFrame, "Reference frame not found in encoded OPB");
+                // NOTE(review): cFrame is dereferenced unconditionally; if X265_CHECK
+                // only reports (or is compiled out) a missing frame is a NULL
+                // dereference here - confirm.
+                cFrame->m_refPicCnt1--;
+            }
+            j++;
+        }
+    }
+}
+
+// Removes every frame whose m_refPicCnt1 counter is zero from m_mcstfPicList
+// and from m_mcstfOrigPicList. Frames removed from m_mcstfOrigPicList are
+// pushed onto m_mcstfOrigPicFreeList; those removed from m_mcstfPicList are
+// only unlinked.
+// NOTE(review): `m_refPicCnt1` looks like an array access whose subscript
+// brackets were lost in this rendering - confirm against the original file.
+void OrigPicBuffer::recycleOrigPicList()
+{
+    Frame *iterFrame = m_mcstfPicList.first();
+
+    while (iterFrame)
+    {
+        Frame *curFrame = iterFrame;
+        // successor is read before curFrame may be unlinked
+        iterFrame = iterFrame->m_nextMCSTF;
+        if (!curFrame->m_refPicCnt1)
+        {
+            m_mcstfPicList.removeMCSTF(*curFrame);
+            // the list changed, so the scan restarts from the head
+            iterFrame = m_mcstfPicList.first();
+        }
+    }
+
+    iterFrame = m_mcstfOrigPicList.first();
+
+    while (iterFrame)
+    {
+        Frame *curFrame = iterFrame;
+        // successor is read before curFrame may be unlinked
+        iterFrame = iterFrame->m_nextMCSTF;
+        if (!curFrame->m_refPicCnt1)
+        {
+            m_mcstfOrigPicList.removeMCSTF(*curFrame);
+            // clear the flag m_isSubSampled points at before the frame is reused
+            *curFrame->m_isSubSampled = false;
+            m_mcstfOrigPicFreeList.pushFrontMCSTF(*curFrame);
+            // the list changed, so the scan restarts from the head
+            iterFrame = m_mcstfOrigPicList.first();
+        }
+    }
+}
+
+// Hands the frame to m_mcstfOrigPicFreeList.pushBack(). inFrame must not be NULL.
+// NOTE(review): every other access to this list in this file goes through the
+// *MCSTF list methods; confirm that plain pushBack() is intended here.
+void OrigPicBuffer::addPictureToFreelist(Frame* inFrame)
+{
+    Frame& frame = *inFrame;
+    m_mcstfOrigPicFreeList.pushBack(frame);
+}
+
+// Gives every data member a defined starting value. The stream-dependent
+// members (m_param, m_bitDepth, source size, colour space, component count,
+// m_metld) are overwritten by init().
+TemporalFilter::TemporalFilter()
+{
+    // Previously left indeterminate until init(); now start from a known state.
+    m_param = NULL;
+    m_metld = NULL;
+    m_bitDepth = 0;
+    m_padding = 0;
+    m_internalCsp = 0;
+    m_numComponents = 0;
+
+    m_sourceWidth = 0;
+    m_sourceHeight = 0;   // was terminated by ',' (comma operator) instead of ';'
+    m_QP = 0;
+    m_sliceTypeConfig = 3;
+    m_numRef = 0;
+    m_useSADinME = 1;
+
+    // Filter tuning defaults
+    m_range = 2;
+    m_chromaFactor = 0.55;
+    m_sigmaMultiplier = 9.0;
+    m_sigmaZeroPoint = 10.0;
+    m_motionVectorFactor = 16;
+}
+
+// Copies the stream properties out of param, allocates the motion estimator
+// and creates the luma-only prediction buffer.
+// param is stored, not copied, so it must outlive this object.
+void TemporalFilter::init(const x265_param* param)
+{
+    m_param        = param;
+    m_sourceWidth  = param->sourceWidth;
+    m_sourceHeight = param->sourceHeight;
+    m_bitDepth     = param->internalBitDepth;
+    m_internalCsp  = param->internalCsp;
+
+    // A 4:0:0 source has a single component; every other format has all of them.
+    if (m_internalCsp == X265_CSP_I400)
+        m_numComponents = 1;
+    else
+        m_numComponents = MAX_NUM_COMPONENT;
+
+    m_metld = new MotionEstimatorTLD;
+
+    predPUYuv.create(FENC_STRIDE, X265_CSP_I400);
+}
+
+// Allocates the per-reference buffers of refFrame: one motion-vector plane at
+// 1/4 resolution, three at 1/16 resolution, the noise and error planes at 1/4
+// resolution, and the compensated picture.
+// Returns 1 on success and 0 when control reaches the `fail` label.
+// CHECKED_MALLOC_ZERO is defined elsewhere and is expected to jump to `fail`
+// when an allocation fails.
+// NOTE(review): the count arguments already include a sizeof() factor; if the
+// macro also scales by the element type these buffers are over-allocated - confirm.
+int TemporalFilter::createRefPicInfo(TemporalFilterRefPicInfo* refFrame, x265_param* param)
+{
+    // Declared ahead of the first allocation so that no `goto fail` crosses an initialisation.
+    const int    stride4   = m_sourceWidth / 4;
+    const int    stride16  = m_sourceWidth / 16;
+    const size_t mvCnt4    = sizeof(MV) * stride4 * (m_sourceHeight / 4);
+    const size_t mvCnt16   = sizeof(MV) * stride16 * (m_sourceHeight / 16);
+    const size_t intCnt4   = sizeof(int) * stride4 * (m_sourceHeight / 4);
+
+    CHECKED_MALLOC_ZERO(refFrame->mvs, MV, mvCnt4);
+    refFrame->mvsStride = stride4;
+
+    CHECKED_MALLOC_ZERO(refFrame->mvs0, MV, mvCnt16);
+    refFrame->mvsStride0 = stride16;
+
+    CHECKED_MALLOC_ZERO(refFrame->mvs1, MV, mvCnt16);
+    refFrame->mvsStride1 = stride16;
+
+    CHECKED_MALLOC_ZERO(refFrame->mvs2, MV, mvCnt16);
+    refFrame->mvsStride2 = stride16;
+
+    CHECKED_MALLOC_ZERO(refFrame->noise, int, intCnt4);
+    CHECKED_MALLOC_ZERO(refFrame->error, int, intCnt4);
+
+    refFrame->slicetype = X265_TYPE_AUTO;
+
+    refFrame->compensatedPic = new PicYuv;
+    refFrame->compensatedPic->create(param, true);
+
+    return 1;
+
+fail:
+    return 0;
+}
+
+int TemporalFilter::motionErrorLumaSAD(
+    PicYuv *orig,
+    PicYuv *buffer,
+    int x,
+    int y,
+    int dx,
​

x265_3.6.tar.gz/source/common/temporalfilter.h Added

@@ -0,0 +1,185 @@
+/*****************************************************************************
+* Copyright (C) 2013-2021 MulticoreWare, Inc
+*
+ * Authors: Ashok Kumar Mishra <ashok@multicorewareinc.com>
+ *
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#ifndef X265_TEMPORAL_FILTER_H
+#define X265_TEMPORAL_FILTER_H
+
+#include "x265.h"
+#include "picyuv.h"
+#include "mv.h"
+#include "piclist.h"
+#include "yuv.h"
+#include "motion.h"
+
+// Interpolation filter table: 16 rows of 8 integer taps each. The taps of
+// every row sum to 64, and row 0 passes the sample through unchanged.
+// Presumably one row per 1/16-sample fractional position - confirm at the use site.
+// NOTE(review): the declarator appears to have lost its array bounds in this
+// rendering (likely [16][8]) - confirm against the original file.
+const int s_interpolationFilter168 =
+{
+    {   0,   0,   0,  64,   0,   0,   0,   0 },   //0
+    {   0,   1,  -3,  64,   4,  -2,   0,   0 },   //1 -->-->
+    {   0,   1,  -6,  62,   9,  -3,   1,   0 },   //2 -->
+    {   0,   2,  -8,  60,  14,  -5,   1,   0 },   //3 -->-->
+    {   0,   2,  -9,  57,  19,  -7,   2,   0 },   //4
+    {   0,   3, -10,  53,  24,  -8,   2,   0 },   //5 -->-->
+    {   0,   3, -11,  50,  29,  -9,   2,   0 },   //6 -->
+    {   0,   3, -11,  44,  35, -10,   3,   0 },   //7 -->-->
+    {   0,   1,  -7,  38,  38,  -7,   1,   0 },   //8
+    {   0,   3, -10,  35,  44, -11,   3,   0 },   //9 -->-->
+    {   0,   2,  -9,  29,  50, -11,   3,   0 },   //10-->
+    {   0,   2,  -8,  24,  53, -10,   3,   0 },   //11-->-->
+    {   0,   2,  -7,  19,  57,  -9,   2,   0 },   //12
+    {   0,   1,  -5,  14,  60,  -8,   2,   0 },   //13-->-->
+    {   0,   1,  -3,   9,  62,  -6,   1,   0 },   //14-->
+    {   0,   0,  -2,   4,  64,  -3,   1,   0 }    //15-->-->
+};
+
+// Reference-strength table: 3 rows of 4 weights. Columns are selected by the
+// absolute POC offset (1 to 4); the row comments name the case each row serves.
+// NOTE(review): the declarator appears to have lost its array bounds in this
+// rendering (likely [3][4]) - confirm against the original file.
+const double s_refStrengths34 =
+{ // abs(POC offset)
+  //  1,    2     3     4
+  {0.85, 0.57, 0.41, 0.33},  // m_range * 2
+  {1.13, 0.97, 0.81, 0.57},  // m_range
+  {0.30, 0.30, 0.30, 0.30}   // otherwise
+};
+
+namespace X265_NS {
+    // Holds the three frame lists used by the MCSTF temporal filter.
+    class OrigPicBuffer
+    {
+    public:
+        PicList    m_mcstfPicList;          // filled by addPicture()
+        PicList    m_mcstfOrigPicFreeList;  // filled by addEncPicture(), addPictureToFreelist() and recycleOrigPicList()
+        PicList    m_mcstfOrigPicList;      // filled by addEncPictureToPicList()
+
+        // destroys and deletes the frames left in m_mcstfOrigPicList and m_mcstfOrigPicFreeList
+        ~OrigPicBuffer();
+        // pushes the frame onto m_mcstfPicList
+        void addPicture(Frame*);
+        // pushes the frame onto m_mcstfOrigPicFreeList
+        void addEncPicture(Frame*);
+        // sets the MCSTF reference frames of the given frame's slice; the int is the frame count bound
+        void setOrigPicList(Frame*, int);
+        // drops frames whose reference counter reached zero from the two non-free lists
+        void recycleOrigPicList();
+        // pushes the frame onto m_mcstfOrigPicFreeList with PicList::pushBack()
+        void addPictureToFreelist(Frame*);
+        // pushes the frame onto m_mcstfOrigPicList
+        void addEncPictureToPicList(Frame*);
+    };
+
+    // Wraps one MotionEstimate instance, configured on construction for
+    // X265_CSP_I400 and X265_LOOKAHEAD_QP.
+    // "TLD" presumably stands for thread-local data - confirm.
+    struct MotionEstimatorTLD
+    {
+        MotionEstimate  me;
+
+        MotionEstimatorTLD()
+        {
+            me.init(X265_CSP_I400);
+            me.setQP(X265_LOOKAHEAD_QP);
+        }
+
+        ~MotionEstimatorTLD() {}
+    };
+
+    // Per-reference-picture state of the temporal filter. The mvs*, noise,
+    // error and compensatedPic members are allocated by
+    // TemporalFilter::createRefPicInfo(); the other pointers are not set there.
+    struct TemporalFilterRefPicInfo
+    {
+        PicYuv*    picBuffer;
+        PicYuv*    picBufferSubSampled2;   // presumably the picture subsampled by 2 - confirm
+        PicYuv*    picBufferSubSampled4;   // presumably the picture subsampled by 4 - confirm
+        MV*        mvs;                    // (width / 4) x (height / 4) motion vectors
+        MV*        mvs0;                   // (width / 16) x (height / 16) motion vectors
+        MV*        mvs1;                   // (width / 16) x (height / 16) motion vectors
+        MV*        mvs2;                   // (width / 16) x (height / 16) motion vectors
+        uint32_t   mvsStride;              // width / 4
+        uint32_t   mvsStride0;             // width / 16
+        uint32_t   mvsStride1;             // width / 16
+        uint32_t   mvsStride2;             // width / 16
+        int*       error;                  // (width / 4) x (height / 4) entries
+        int*       noise;                  // (width / 4) x (height / 4) entries
+
+        int16_t    origOffset;
+        bool       isFilteredFrame;
+        PicYuv*    compensatedPic;         // created by createRefPicInfo()
+
+        int*       isSubsampled;
+
+        int        slicetype;              // set to X265_TYPE_AUTO by createRefPicInfo()
+    };
+
+    // Temporal filter state and operations (motion estimation, motion
+    // application, bilateral filtering). Construct, then call init() with the
+    // encoder parameters.
+    class TemporalFilter
+    {
+    public:
+        TemporalFilter();
+        ~TemporalFilter() {}
+
+        // copies the stream properties from param and allocates m_metld
+        void init(const x265_param* param);
+
+        // NOTE: the access specifier below is commented out, so every member
+        // that follows is public.
+        //private:
+            // Tuning values (defaults are set in the constructor)
+        const x265_param *m_param;     // set by init(); not owned
+        int32_t  m_bitDepth;           // param->internalBitDepth
+        int m_range;                   // constructor default 2
+        uint8_t m_numRef;
+        double m_chromaFactor;         // constructor default 0.55
+        double m_sigmaMultiplier;      // constructor default 9.0
+        double m_sigmaZeroPoint;       // constructor default 10.0
+        int m_motionVectorFactor;      // constructor default 16
+        int m_padding;
+
+        // Stream properties
+
+        int m_sourceWidth;
+        int m_sourceHeight;
+        int m_QP;
+
+        int m_internalCsp;
+        int m_numComponents;           // 1 for X265_CSP_I400, MAX_NUM_COMPONENT otherwise
+        uint8_t m_sliceTypeConfig;     // constructor default 3
+
+        MotionEstimatorTLD* m_metld;   // allocated by init()
+        Yuv  predPUYuv;                // created by init() as FENC_STRIDE, X265_CSP_I400
+        int m_useSADinME;              // constructor default 1
+
+        // allocates the buffers of refFrame; returns 1 on success, 0 on failure
+        int createRefPicInfo(TemporalFilterRefPicInfo* refFrame, x265_param* param);
+
+        void bilateralFilter(Frame* frame, TemporalFilterRefPicInfo* mctfRefList, double overallStrength);
+
+        // previous / prevmvStride / factor are optional and default to "none"
+        void motionEstimationLuma(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int bs,
+            MV *previous = 0, uint32_t prevmvStride = 0, int factor = 1);
+
+        void motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int blockSize,
+            MV *previous, uint32_t prevMvStride, int factor, int* minError);
+
+        // besterror defaults to 8 * 8 * 1024 * 1024
+        int motionErrorLumaSSD(PicYuv *orig,
+            PicYuv *buffer,
+            int x,
+            int y,
+            int dx,
+            int dy,
+            int bs,
+            int besterror = 8 * 8 * 1024 * 1024);
+
+        // same parameters as motionErrorLumaSSD
+        int motionErrorLumaSAD(PicYuv *orig,
+            PicYuv *buffer,
+            int x,
+            int y,
+            int dx,
+            int dy,
+            int bs,
+            int besterror = 8 * 8 * 1024 * 1024);
+
+        void destroyRefPicInfo(TemporalFilterRefPicInfo* curFrame);
+
+        void applyMotion(MV *mvs, uint32_t mvsStride, PicYuv *input, PicYuv *output);
+
+    };
+}
+#endif

 
@@ -0,0 +1,185 @@
+/*****************************************************************************
+* Copyright (C) 2013-2021 MulticoreWare, Inc
+*
+ * Authors: Ashok Kumar Mishra <ashok@multicorewareinc.com>
+ *
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#ifndef X265_TEMPORAL_FILTER_H
+#define X265_TEMPORAL_FILTER_H
+
+#include "x265.h"
+#include "picyuv.h"
+#include "mv.h"
+#include "piclist.h"
+#include "yuv.h"
+#include "motion.h"
+
+// Interpolation filter table: 16 rows of 8 integer taps each. The taps of
+// every row sum to 64, and row 0 passes the sample through unchanged.
+// Presumably one row per 1/16-sample fractional position - confirm at the use site.
+// NOTE(review): the declarator appears to have lost its array bounds in this
+// rendering (likely [16][8]) - confirm against the original file.
+const int s_interpolationFilter168 =
+{
+    {   0,   0,   0,  64,   0,   0,   0,   0 },   //0
+    {   0,   1,  -3,  64,   4,  -2,   0,   0 },   //1 -->-->
+    {   0,   1,  -6,  62,   9,  -3,   1,   0 },   //2 -->
+    {   0,   2,  -8,  60,  14,  -5,   1,   0 },   //3 -->-->
+    {   0,   2,  -9,  57,  19,  -7,   2,   0 },   //4
+    {   0,   3, -10,  53,  24,  -8,   2,   0 },   //5 -->-->
+    {   0,   3, -11,  50,  29,  -9,   2,   0 },   //6 -->
+    {   0,   3, -11,  44,  35, -10,   3,   0 },   //7 -->-->
+    {   0,   1,  -7,  38,  38,  -7,   1,   0 },   //8
+    {   0,   3, -10,  35,  44, -11,   3,   0 },   //9 -->-->
+    {   0,   2,  -9,  29,  50, -11,   3,   0 },   //10-->
+    {   0,   2,  -8,  24,  53, -10,   3,   0 },   //11-->-->
+    {   0,   2,  -7,  19,  57,  -9,   2,   0 },   //12
+    {   0,   1,  -5,  14,  60,  -8,   2,   0 },   //13-->-->
+    {   0,   1,  -3,   9,  62,  -6,   1,   0 },   //14-->
+    {   0,   0,  -2,   4,  64,  -3,   1,   0 }    //15-->-->
+};
+
+// Reference-strength table: 3 rows of 4 weights. Columns are selected by the
+// absolute POC offset (1 to 4); the row comments name the case each row serves.
+// NOTE(review): the declarator appears to have lost its array bounds in this
+// rendering (likely [3][4]) - confirm against the original file.
+const double s_refStrengths34 =
+{ // abs(POC offset)
+  //  1,    2     3     4
+  {0.85, 0.57, 0.41, 0.33},  // m_range * 2
+  {1.13, 0.97, 0.81, 0.57},  // m_range
+  {0.30, 0.30, 0.30, 0.30}   // otherwise
+};
+
+namespace X265_NS {
+    // Holds the three frame lists used by the MCSTF temporal filter.
+    class OrigPicBuffer
+    {
+    public:
+        PicList    m_mcstfPicList;          // filled by addPicture()
+        PicList    m_mcstfOrigPicFreeList;  // filled by addEncPicture(), addPictureToFreelist() and recycleOrigPicList()
+        PicList    m_mcstfOrigPicList;      // filled by addEncPictureToPicList()
+
+        // destroys and deletes the frames left in m_mcstfOrigPicList and m_mcstfOrigPicFreeList
+        ~OrigPicBuffer();
+        // pushes the frame onto m_mcstfPicList
+        void addPicture(Frame*);
+        // pushes the frame onto m_mcstfOrigPicFreeList
+        void addEncPicture(Frame*);
+        // sets the MCSTF reference frames of the given frame's slice; the int is the frame count bound
+        void setOrigPicList(Frame*, int);
+        // drops frames whose reference counter reached zero from the two non-free lists
+        void recycleOrigPicList();
+        // pushes the frame onto m_mcstfOrigPicFreeList with PicList::pushBack()
+        void addPictureToFreelist(Frame*);
+        // pushes the frame onto m_mcstfOrigPicList
+        void addEncPictureToPicList(Frame*);
+    };
+
+    // Wraps one MotionEstimate instance, configured on construction for
+    // X265_CSP_I400 and X265_LOOKAHEAD_QP.
+    // "TLD" presumably stands for thread-local data - confirm.
+    struct MotionEstimatorTLD
+    {
+        MotionEstimate  me;
+
+        MotionEstimatorTLD()
+        {
+            me.init(X265_CSP_I400);
+            me.setQP(X265_LOOKAHEAD_QP);
+        }
+
+        ~MotionEstimatorTLD() {}
+    };
+
+    // Per-reference-picture state of the temporal filter. The mvs*, noise,
+    // error and compensatedPic members are allocated by
+    // TemporalFilter::createRefPicInfo(); the other pointers are not set there.
+    struct TemporalFilterRefPicInfo
+    {
+        PicYuv*    picBuffer;
+        PicYuv*    picBufferSubSampled2;   // presumably the picture subsampled by 2 - confirm
+        PicYuv*    picBufferSubSampled4;   // presumably the picture subsampled by 4 - confirm
+        MV*        mvs;                    // (width / 4) x (height / 4) motion vectors
+        MV*        mvs0;                   // (width / 16) x (height / 16) motion vectors
+        MV*        mvs1;                   // (width / 16) x (height / 16) motion vectors
+        MV*        mvs2;                   // (width / 16) x (height / 16) motion vectors
+        uint32_t   mvsStride;              // width / 4
+        uint32_t   mvsStride0;             // width / 16
+        uint32_t   mvsStride1;             // width / 16
+        uint32_t   mvsStride2;             // width / 16
+        int*       error;                  // (width / 4) x (height / 4) entries
+        int*       noise;                  // (width / 4) x (height / 4) entries
+
+        int16_t    origOffset;
+        bool       isFilteredFrame;
+        PicYuv*    compensatedPic;         // created by createRefPicInfo()
+
+        int*       isSubsampled;
+
+        int        slicetype;              // set to X265_TYPE_AUTO by createRefPicInfo()
+    };
+
+    // Temporal filter state and operations (motion estimation, motion
+    // application, bilateral filtering). Construct, then call init() with the
+    // encoder parameters.
+    class TemporalFilter
+    {
+    public:
+        TemporalFilter();
+        ~TemporalFilter() {}
+
+        // copies the stream properties from param and allocates m_metld
+        void init(const x265_param* param);
+
+        // NOTE: the access specifier below is commented out, so every member
+        // that follows is public.
+        //private:
+            // Tuning values (defaults are set in the constructor)
+        const x265_param *m_param;     // set by init(); not owned
+        int32_t  m_bitDepth;           // param->internalBitDepth
+        int m_range;                   // constructor default 2
+        uint8_t m_numRef;
+        double m_chromaFactor;         // constructor default 0.55
+        double m_sigmaMultiplier;      // constructor default 9.0
+        double m_sigmaZeroPoint;       // constructor default 10.0
+        int m_motionVectorFactor;      // constructor default 16
+        int m_padding;
+
+        // Stream properties
+
+        int m_sourceWidth;
+        int m_sourceHeight;
+        int m_QP;
+
+        int m_internalCsp;
+        int m_numComponents;           // 1 for X265_CSP_I400, MAX_NUM_COMPONENT otherwise
+        uint8_t m_sliceTypeConfig;     // constructor default 3
+
+        MotionEstimatorTLD* m_metld;   // allocated by init()
+        Yuv  predPUYuv;                // created by init() as FENC_STRIDE, X265_CSP_I400
+        int m_useSADinME;              // constructor default 1
+
+        // allocates the buffers of refFrame; returns 1 on success, 0 on failure
+        int createRefPicInfo(TemporalFilterRefPicInfo* refFrame, x265_param* param);
+
+        void bilateralFilter(Frame* frame, TemporalFilterRefPicInfo* mctfRefList, double overallStrength);
+
+        // previous / prevmvStride / factor are optional and default to "none"
+        void motionEstimationLuma(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int bs,
+            MV *previous = 0, uint32_t prevmvStride = 0, int factor = 1);
+
+        void motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int blockSize,
+            MV *previous, uint32_t prevMvStride, int factor, int* minError);
+
+        // besterror defaults to 8 * 8 * 1024 * 1024
+        int motionErrorLumaSSD(PicYuv *orig,
+            PicYuv *buffer,
+            int x,
+            int y,
+            int dx,
+            int dy,
+            int bs,
+            int besterror = 8 * 8 * 1024 * 1024);
+
+        // same parameters as motionErrorLumaSSD
+        int motionErrorLumaSAD(PicYuv *orig,
+            PicYuv *buffer,
+            int x,
+            int y,
+            int dx,
+            int dy,
+            int bs,
+            int besterror = 8 * 8 * 1024 * 1024);
+
+        void destroyRefPicInfo(TemporalFilterRefPicInfo* curFrame);
+
+        void applyMotion(MV *mvs, uint32_t mvsStride, PicYuv *input, PicYuv *output);
+
+    };
+}
+#endif
​

x265_3.5.tar.gz/source/common/threading.h -> x265_3.6.tar.gz/source/common/threading.h Changed

@@ -3,6 +3,7 @@
  *
  * Authors: Steve Borho <steve@borho.org>
  *          Min Chen <chenm003@163.com>
+            liwei <liwei@multicorewareinc.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -253,6 +254,47 @@
     int                m_val;
 };
 
+/* Wrapper around a Win32 semaphore handle obtained from CreateSemaphoreA().
+ * The destructor does not close the handle; release() must be called
+ * explicitly. Every operation on an object without a handle fails cleanly
+ * instead of handing a NULL handle to the Win32 API. */
+class NamedSemaphore
+{
+public:
+    NamedSemaphore() : m_sem(NULL)
+    {
+    }
+
+    ~NamedSemaphore()
+    {
+    }
+
+    /* Create the semaphore with the given name, initial count and maximum
+     * count. Does nothing when a handle is already held.
+     * Returns true when the object holds a handle afterwards. */
+    bool create(const char* name, const int initcnt, const int maxcnt)
+    {
+        if(!m_sem)
+        {
+            m_sem = CreateSemaphoreA(NULL, initcnt, maxcnt, name);
+        }
+        return m_sem != NULL;
+    }
+
+    /* Increase the semaphore count by cnt.
+     * Returns false when no handle is held or ReleaseSemaphore() fails. */
+    bool give(const int32_t cnt)
+    {
+        if (!m_sem)
+        {
+            return false;
+        }
+        return ReleaseSemaphore(m_sem, (LONG)cnt, NULL) != FALSE;
+    }
+
+    /* Wait on the semaphore for at most time_out milliseconds (INFINITE by
+     * default). Returns false when no handle is held, on timeout, or when
+     * the wait fails. */
+    bool take(const uint32_t time_out = INFINITE)
+    {
+        if (!m_sem)
+        {
+            return false;
+        }
+        /* WaitForSingleObject() returns a DWORD; keep it unsigned so the
+         * comparisons with the WAIT_* constants need no implicit conversion */
+        const DWORD rt = WaitForSingleObject(m_sem, time_out);
+        return rt != WAIT_TIMEOUT && rt != WAIT_FAILED;
+    }
+
+    /* Close the handle, if any, and reset the object to its empty state. */
+    void release()
+    {
+        if (m_sem)
+        {
+            CloseHandle(m_sem);
+            m_sem = NULL;
+        }
+    }
+
+private:
+    HANDLE m_sem;
+};
+
 #else /* POSIX / pthreads */
 
 typedef pthread_t ThreadHandle;
@@ -459,6 +501,282 @@
     int             m_val;
 };
 
+#define TIMEOUT_INFINITE 0xFFFFFFFF
+
+class NamedSemaphore
+{
+public:
+    NamedSemaphore() 
+        : m_sem(NULL)
+#ifndef __APPLE__
+        , m_name(NULL)
+#endif //__APPLE__
+    {
+    }
+
+    ~NamedSemaphore()
+    {
+    }
+
+    bool create(const char* name, const int initcnt, const int maxcnt)
+    {
+        bool ret = false;
+
+        if (initcnt >= maxcnt)
+        {
+            return false;
+        }
+
+#ifdef __APPLE__
+        do
+        {
+            int32_t pshared = name != NULL ? PTHREAD_PROCESS_SHARED : PTHREAD_PROCESS_PRIVATE;
+
+            m_sem = (mac_sem_t *)malloc(sizeof(mac_sem_t));
+            if (!m_sem)
+            {
+                break;
+            }
+
+            if (pthread_mutexattr_init(&m_sem->mutexAttr))
+            {
+                break;
+            }
+
+            if (pthread_mutexattr_setpshared(&m_sem->mutexAttr, pshared))
+            {
+                break;
+            }
+
+            if (pthread_condattr_init(&m_sem->condAttr))
+            {
+                break;
+            }
+
+            if (pthread_condattr_setpshared(&m_sem->condAttr, pshared))
+            {
+                break;
+            }
+
+            if (pthread_mutex_init(&m_sem->mutex, &m_sem->mutexAttr))
+            {
+                break;
+            }
+
+            if (pthread_cond_init(&m_sem->cond, &m_sem->condAttr))
+            {
+                break;
+            }
+
+            m_sem->curCnt = initcnt;
+            m_sem->maxCnt = maxcnt;
+
+            ret = true;
+        } while (0);
+        
+        if (!ret)
+        {
+            release();
+        }
+
+#else  //__APPLE__
+        m_sem = sem_open(name, O_CREAT | O_EXCL, 0666, initcnt);
+        if (m_sem != SEM_FAILED) 
+        {
+            m_name = strdup(name);
+            ret = true;
+        }
+        else 
+        {
+            if (EEXIST == errno) 
+            {
+                m_sem = sem_open(name, 0);
+                if (m_sem != SEM_FAILED) 
+                {
+                    m_name = strdup(name);
+                    ret = true;
+                }
+            }
+        }
+#endif //__APPLE__
+
+        return ret;
+    }
+
+    bool give(const int32_t cnt)
+    {
+        if (!m_sem)
+        {
+            return false;
+        }
+
+#ifdef __APPLE__
+        if (pthread_mutex_lock(&m_sem->mutex))
+        {
+            return false;
+        }
+
+        int oldCnt = m_sem->curCnt;
+        m_sem->curCnt += cnt;
+        if (m_sem->curCnt > m_sem->maxCnt)
+        {
+            m_sem->curCnt = m_sem->maxCnt;
+        }
+
+        bool ret = true;
+        if (!oldCnt)
+        {
+            ret = 0 == pthread_cond_broadcast(&m_sem->cond);
+        }
+
+        if (pthread_mutex_unlock(&m_sem->mutex))
+        {
+            return false;
+        }
+
+        return ret;
+#else //__APPLE__
+        int ret = 0;
+        int32_t curCnt = cnt;
+        while (curCnt-- && !ret) {
+            ret = sem_post(m_sem);
+        }

 
@@ -3,6 +3,7 @@
  *
  * Authors: Steve Borho <steve@borho.org>
  *          Min Chen <chenm003@163.com>
+            liwei <liwei@multicorewareinc.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -253,6 +254,47 @@
     int                m_val;
 };
 
+/* Wrapper around a Win32 semaphore handle obtained from CreateSemaphoreA().
+ * The destructor does not close the handle; release() must be called
+ * explicitly. Every operation on an object without a handle fails cleanly
+ * instead of handing a NULL handle to the Win32 API. */
+class NamedSemaphore
+{
+public:
+    NamedSemaphore() : m_sem(NULL)
+    {
+    }
+
+    ~NamedSemaphore()
+    {
+    }
+
+    /* Create the semaphore with the given name, initial count and maximum
+     * count. Does nothing when a handle is already held.
+     * Returns true when the object holds a handle afterwards. */
+    bool create(const char* name, const int initcnt, const int maxcnt)
+    {
+        if(!m_sem)
+        {
+            m_sem = CreateSemaphoreA(NULL, initcnt, maxcnt, name);
+        }
+        return m_sem != NULL;
+    }
+
+    /* Increase the semaphore count by cnt.
+     * Returns false when no handle is held or ReleaseSemaphore() fails. */
+    bool give(const int32_t cnt)
+    {
+        if (!m_sem)
+        {
+            return false;
+        }
+        return ReleaseSemaphore(m_sem, (LONG)cnt, NULL) != FALSE;
+    }
+
+    /* Wait on the semaphore for at most time_out milliseconds (INFINITE by
+     * default). Returns false when no handle is held, on timeout, or when
+     * the wait fails. */
+    bool take(const uint32_t time_out = INFINITE)
+    {
+        if (!m_sem)
+        {
+            return false;
+        }
+        /* WaitForSingleObject() returns a DWORD; keep it unsigned so the
+         * comparisons with the WAIT_* constants need no implicit conversion */
+        const DWORD rt = WaitForSingleObject(m_sem, time_out);
+        return rt != WAIT_TIMEOUT && rt != WAIT_FAILED;
+    }
+
+    /* Close the handle, if any, and reset the object to its empty state. */
+    void release()
+    {
+        if (m_sem)
+        {
+            CloseHandle(m_sem);
+            m_sem = NULL;
+        }
+    }
+
+private:
+    HANDLE m_sem;
+};
+
 #else /* POSIX / pthreads */
 
 typedef pthread_t ThreadHandle;
@@ -459,6 +501,282 @@
     int             m_val;
 };
 
+#define TIMEOUT_INFINITE 0xFFFFFFFF
+
+class NamedSemaphore
+{
+public:
+    NamedSemaphore() 
+        : m_sem(NULL)
+#ifndef __APPLE__
+        , m_name(NULL)
+#endif //__APPLE__
+    {
+    }
+
+    ~NamedSemaphore()
+    {
+    }
+
+    bool create(const char* name, const int initcnt, const int maxcnt)
+    {
+        bool ret = false;
+
+        if (initcnt >= maxcnt)
+        {
+            return false;
+        }
+
+#ifdef __APPLE__
+        do
+        {
+            int32_t pshared = name != NULL ? PTHREAD_PROCESS_SHARED : PTHREAD_PROCESS_PRIVATE;
+
+            m_sem = (mac_sem_t *)malloc(sizeof(mac_sem_t));
+            if (!m_sem)
+            {
+                break;
+            }
+
+            if (pthread_mutexattr_init(&m_sem->mutexAttr))
+            {
+                break;
+            }
+
+            if (pthread_mutexattr_setpshared(&m_sem->mutexAttr, pshared))
+            {
+                break;
+            }
+
+            if (pthread_condattr_init(&m_sem->condAttr))
+            {
+                break;
+            }
+
+            if (pthread_condattr_setpshared(&m_sem->condAttr, pshared))
+            {
+                break;
+            }
+
+            if (pthread_mutex_init(&m_sem->mutex, &m_sem->mutexAttr))
+            {
+                break;
+            }
+
+            if (pthread_cond_init(&m_sem->cond, &m_sem->condAttr))
+            {
+                break;
+            }
+
+            m_sem->curCnt = initcnt;
+            m_sem->maxCnt = maxcnt;
+
+            ret = true;
+        } while (0);
+        
+        if (!ret)
+        {
+            release();
+        }
+
+#else  //__APPLE__
+        m_sem = sem_open(name, O_CREAT | O_EXCL, 0666, initcnt);
+        if (m_sem != SEM_FAILED) 
+        {
+            m_name = strdup(name);
+            ret = true;
+        }
+        else 
+        {
+            if (EEXIST == errno) 
+            {
+                m_sem = sem_open(name, 0);
+                if (m_sem != SEM_FAILED) 
+                {
+                    m_name = strdup(name);
+                    ret = true;
+                }
+            }
+        }
+#endif //__APPLE__
+
+        return ret;
+    }
+
+    bool give(const int32_t cnt)
+    {
+        if (!m_sem)
+        {
+            return false;
+        }
+
+#ifdef __APPLE__
+        if (pthread_mutex_lock(&m_sem->mutex))
+        {
+            return false;
+        }
+
+        int oldCnt = m_sem->curCnt;
+        m_sem->curCnt += cnt;
+        if (m_sem->curCnt > m_sem->maxCnt)
+        {
+            m_sem->curCnt = m_sem->maxCnt;
+        }
+
+        bool ret = true;
+        if (!oldCnt)
+        {
+            ret = 0 == pthread_cond_broadcast(&m_sem->cond);
+        }
+
+        if (pthread_mutex_unlock(&m_sem->mutex))
+        {
+            return false;
+        }
+
+        return ret;
+#else //__APPLE__
+        int ret = 0;
+        int32_t curCnt = cnt;
+        while (curCnt-- && !ret) {
+            ret = sem_post(m_sem);
+        }
​

x265_3.5.tar.gz/source/common/threadpool.cpp -> x265_3.6.tar.gz/source/common/threadpool.cpp Changed

 
@@ -301,7 +301,7 @@
     /* limit threads based on param->numaPools
      * For windows because threads can't be allocated to live across sockets
      * changing the default behavior to be per-socket pools -- FIXME */
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 || HAVE_LIBNUMA
     if (!p->numaPools || (strcmp(p->numaPools, "NULL") == 0 || strcmp(p->numaPools, "*") == 0 || strcmp(p->numaPools, "") == 0))
     {
          char poolString50 = "";
​

x265_3.5.tar.gz/source/common/version.cpp -> x265_3.6.tar.gz/source/common/version.cpp Changed

 
@@ -71,7 +71,7 @@
 #define ONOS    "Unk-OS"
 #endif
 
-#if X86_64
+#if defined(_LP64) || defined(_WIN64)
 #define BITS    "64 bit"
 #else
 #define BITS    "32 bit"
​

x265_3.5.tar.gz/source/common/x86/asm-primitives.cpp -> x265_3.6.tar.gz/source/common/x86/asm-primitives.cpp Changed

@@ -1091,6 +1091,7 @@
 
         p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
         p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);
         // TODO: the planecopy_sp is really planecopy_SC now, must be fix it 
         //p.planecopy_sp = PFX(downShift_16_sse2);
         p.planecopy_sp_shl = PFX(upShift_16_sse2);
@@ -1121,6 +1122,7 @@
     {
         ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
         p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);
 
         // p.puLUMA_4x4.satd = p.cuBLOCK_4x4.sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken
         ALL_LUMA_PU(satd, pixel_satd, ssse3);
@@ -1462,6 +1464,7 @@
         p.puLUMA_64x48.copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
         p.puLUMA_64x64.copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
         p.propagateCost = PFX(mbtree_propagate_cost_avx);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);
     }
     if (cpuMask & X265_CPU_XOP)
     {
@@ -1473,6 +1476,7 @@
         LUMA_VAR(xop);
         p.frameInitLowres = PFX(frame_init_lowres_core_xop);
         p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);
     }
     if (cpuMask & X265_CPU_AVX2)
     {
@@ -2301,6 +2305,9 @@
 
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
         p.frameInitLowerRes = PFX(frame_init_lowres_core_avx2);
+
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);
+
         p.propagateCost = PFX(mbtree_propagate_cost_avx2);
         p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
         p.fix8Pack = PFX(cutree_fix8_pack_avx2);
@@ -3300,6 +3307,7 @@
         //p.frameInitLowres = PFX(frame_init_lowres_core_mmx2);
         p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
         p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);
 
         ALL_LUMA_TU(blockfill_sNONALIGNED, blockfill_s, sse2);
         ALL_LUMA_TU(blockfill_sALIGNED, blockfill_s, sse2);
@@ -3424,6 +3432,8 @@
         ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
         p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
 
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);
+
         ASSIGN2(p.puLUMA_8x4.convert_p2s, filterPixelToShort_8x4_ssse3);
         ASSIGN2(p.puLUMA_8x8.convert_p2s, filterPixelToShort_8x8_ssse3);
         ASSIGN2(p.puLUMA_8x16.convert_p2s, filterPixelToShort_8x16_ssse3);
@@ -3691,6 +3701,7 @@
         p.frameInitLowres = PFX(frame_init_lowres_core_avx);
         p.frameInitLowerRes = PFX(frame_init_lowres_core_avx);
         p.propagateCost = PFX(mbtree_propagate_cost_avx);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);
     }
     if (cpuMask & X265_CPU_XOP)
     {
@@ -3702,6 +3713,7 @@
         p.cuBLOCK_16x16.sse_pp = PFX(pixel_ssd_16x16_xop);
         p.frameInitLowres = PFX(frame_init_lowres_core_xop);
         p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);
 
     }
 #if X86_64
@@ -4684,6 +4696,8 @@
         p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2);
         p.saoCuStatsE3 = PFX(saoCuStatsE3_avx2);
 
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);
+
         if (cpuMask & X265_CPU_BMI2)
         {
             p.scanPosLast = PFX(scanPosLast_avx2_bmi2);

 
@@ -1091,6 +1091,7 @@
 
         p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
         p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);
         // TODO: the planecopy_sp is really planecopy_SC now, must be fix it 
         //p.planecopy_sp = PFX(downShift_16_sse2);
         p.planecopy_sp_shl = PFX(upShift_16_sse2);
@@ -1121,6 +1122,7 @@
     {
         ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
         p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);
 
         // p.puLUMA_4x4.satd = p.cuBLOCK_4x4.sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken
         ALL_LUMA_PU(satd, pixel_satd, ssse3);
@@ -1462,6 +1464,7 @@
         p.puLUMA_64x48.copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
         p.puLUMA_64x64.copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
         p.propagateCost = PFX(mbtree_propagate_cost_avx);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);
     }
     if (cpuMask & X265_CPU_XOP)
     {
@@ -1473,6 +1476,7 @@
         LUMA_VAR(xop);
         p.frameInitLowres = PFX(frame_init_lowres_core_xop);
         p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);
     }
     if (cpuMask & X265_CPU_AVX2)
     {
@@ -2301,6 +2305,9 @@
 
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
         p.frameInitLowerRes = PFX(frame_init_lowres_core_avx2);
+
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);
+
         p.propagateCost = PFX(mbtree_propagate_cost_avx2);
         p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
         p.fix8Pack = PFX(cutree_fix8_pack_avx2);
@@ -3300,6 +3307,7 @@
         //p.frameInitLowres = PFX(frame_init_lowres_core_mmx2);
         p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
         p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);
 
         ALL_LUMA_TU(blockfill_sNONALIGNED, blockfill_s, sse2);
         ALL_LUMA_TU(blockfill_sALIGNED, blockfill_s, sse2);
@@ -3424,6 +3432,8 @@
         ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
         p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
 
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);
+
         ASSIGN2(p.puLUMA_8x4.convert_p2s, filterPixelToShort_8x4_ssse3);
         ASSIGN2(p.puLUMA_8x8.convert_p2s, filterPixelToShort_8x8_ssse3);
         ASSIGN2(p.puLUMA_8x16.convert_p2s, filterPixelToShort_8x16_ssse3);
@@ -3691,6 +3701,7 @@
         p.frameInitLowres = PFX(frame_init_lowres_core_avx);
         p.frameInitLowerRes = PFX(frame_init_lowres_core_avx);
         p.propagateCost = PFX(mbtree_propagate_cost_avx);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);
     }
     if (cpuMask & X265_CPU_XOP)
     {
@@ -3702,6 +3713,7 @@
         p.cuBLOCK_16x16.sse_pp = PFX(pixel_ssd_16x16_xop);
         p.frameInitLowres = PFX(frame_init_lowres_core_xop);
         p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);
 
     }
 #if X86_64
@@ -4684,6 +4696,8 @@
         p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2);
         p.saoCuStatsE3 = PFX(saoCuStatsE3_avx2);
 
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);
+
         if (cpuMask & X265_CPU_BMI2)
         {
             p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
​

x265_3.5.tar.gz/source/common/x86/const-a.asm -> x265_3.6.tar.gz/source/common/x86/const-a.asm Changed

 
@@ -100,7 +100,7 @@
 const pw_2000,              times 16 dw 0x2000
 const pw_8000,              times  8 dw 0x8000
 const pw_3fff,              times 16 dw 0x3fff
-const pw_32_0,              times  4 dw 32,
+const pw_32_0,              times  4 dw 32
                             times  4 dw 0
 const pw_pixel_max,         times 16 dw ((1 << BIT_DEPTH)-1)
 
​

x265_3.5.tar.gz/source/common/x86/h-ipfilter8.asm -> x265_3.6.tar.gz/source/common/x86/h-ipfilter8.asm Changed

 
@@ -125,6 +125,9 @@
 ALIGN 32
 interp4_hps_shuf: times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
 
+ALIGN 32
+const interp_4tap_8x8_horiz_shuf,   dd 0, 4, 1, 5, 2, 6, 3, 7
+
 SECTION .text
 
 cextern pw_1
@@ -1459,8 +1462,6 @@
 
     RET
 
-ALIGN 32
-const interp_4tap_8x8_horiz_shuf,   dd 0, 4, 1, 5, 2, 6, 3, 7
 
 %macro FILTER_H4_w6 3
     movu        %1, srcq - 1
​

x265_3.5.tar.gz/source/common/x86/mc-a2.asm -> x265_3.6.tar.gz/source/common/x86/mc-a2.asm Changed

@@ -992,6 +992,262 @@
 FRAME_INIT_LOWRES
 %endif
 
+; SUBSAMPLEFILT8x4: %1/%2 = result registers, %3/%4 = row registers,
+; %5/%6 = extra outputs (non-XOP path only), %7 = byte offset from r0.
+; Loads the rows at [r0+%7], [r0+r2+%7] and [r0+r2*2+%7], averages
+; neighbouring rows with pavgb, then averages each row result with the
+; PALIGNR (shift 1) output. Without XOP, %5/%6 receive the results shifted
+; right by 8 bits per word and %1/%2 are masked with m7.
+%macro SUBSAMPLEFILT8x4 7
+    mova      %3, [r0+%7]
+    mova      %4, [r0+r2+%7]
+    pavgb     %3, %4
+    pavgb     %4, [r0+r2*2+%7]
+    PALIGNR   %1, %3, 1, m6
+    PALIGNR   %2, %4, 1, m6
+%if cpuflag(xop)
+    pavgb     %1, %3
+    pavgb     %2, %4
+%else
+    pavgb     %1, %3
+    pavgb     %2, %4
+    psrlw     %5, %1, 8
+    psrlw     %6, %2, 8
+    pand      %1, m7
+    pand      %2, m7
+%endif
+%endmacro
+
+; SUBSAMPLEFILT32x4U: %1 = destination address.
+; Uses unaligned loads from the rows at [r0], [r0+r2] and [r0+r2*2] (plus the
+; same rows at +1 and +mmsize), averages vertically and horizontally with
+; pavgb, shuffles both halves through m7, merges them and stores one vector
+; to [%1].
+%macro SUBSAMPLEFILT32x4U 1
+    movu      m1, [r0+r2]
+    pavgb     m0, m1, [r0]
+    movu      m3, [r0+r2+1]
+    pavgb     m2, m3, [r0+1]
+    pavgb     m1, [r0+r2*2]
+    pavgb     m3, [r0+r2*2+1]
+    pavgb     m0, m2
+    pavgb     m1, m3
+
+    movu      m3, [r0+r2+mmsize]
+    pavgb     m2, m3, [r0+mmsize]
+    movu      m5, [r0+r2+1+mmsize]
+    pavgb     m4, m5, [r0+1+mmsize]
+    pavgb     m2, m4
+
+    pshufb    m0, m7
+    pshufb    m2, m7
+    punpcklqdq m0, m0, m2
+    vpermq    m0, m0, q3120
+    movu    [%1], m0
+%endmacro
+
+; SUBSAMPLEFILT16x2: %1 = carry register, %2 = destination address,
+; %3 = byte offset from r0.
+; Averages the rows at [r0+%3] and [r0+%3+r2] (two vectors each), averages
+; each with its PALIGNR (shift 1) counterpart, packs the two results into one
+; vector and stores it to [%2]. The vertically averaged low vector (m2) is
+; left in %1 on exit.
+%macro SUBSAMPLEFILT16x2 3
+    mova      m3, [r0+%3+mmsize]
+    mova      m2, [r0+%3]
+    pavgb     m3, [r0+%3+r2+mmsize]
+    pavgb     m2, [r0+%3+r2]
+    PALIGNR   %1, m3, 1, m6
+    pavgb     %1, m3
+    PALIGNR   m3, m2, 1, m6
+    pavgb     m3, m2
+%if cpuflag(xop)
+    vpperm    m3, m3, %1, m6
+%else
+    pand      m3, m7
+    pand      %1, m7
+    packuswb  m3, %1
+%endif
+    mova    [%2], m3
+    mova      %1, m2
+%endmacro
+
+; SUBSAMPLEFILT8x2U: %1 = destination address, %2 = byte offset from r0.
+; Averages the rows at [r0+%2] and [r0+%2+r2] and the same rows at +1,
+; combines them with pavgb, masks with m7, packs and stores to [%1].
+; NOTE(review): m1 and m3 are read here but never written inside this macro;
+; their contents must come from the caller - confirm.
+%macro SUBSAMPLEFILT8x2U 2
+    mova      m2, [r0+%2]
+    pavgb     m2, [r0+%2+r2]
+    mova      m0, [r0+%2+1]
+    pavgb     m0, [r0+%2+r2+1]
+    pavgb     m1, m3
+    pavgb     m0, m2
+    pand      m1, m7
+    pand      m0, m7
+    packuswb  m0, m1
+    mova    [%1], m0
+%endmacro
+
+; SUBSAMPLEFILT8xU: %1 = destination address, %2 = byte offset from r0.
+; Word (pavgw) variant: averages the rows at [r0+%2] and [r0+%2+r2], both at
+; the base position and 2 bytes further on, combines the two, masks with m7,
+; packs with packssdw and stores to [%1] with an unaligned store.
+%macro SUBSAMPLEFILT8xU 2
+    mova      m3, [r0+%2+8]
+    mova      m2, [r0+%2]
+    pavgw     m3, [r0+%2+r2+8]
+    pavgw     m2, [r0+%2+r2]
+    movu      m1, [r0+%2+10]
+    movu      m0, [r0+%2+2]
+    pavgw     m1, [r0+%2+r2+10]
+    pavgw     m0, [r0+%2+r2+2]
+    pavgw     m1, m3
+    pavgw     m0, m2
+    psrld     m3, m1, 16
+    pand      m1, m7
+    pand      m0, m7
+    packssdw  m0, m1
+    movu    [%1], m0
+%endmacro
+
+; SUBSAMPLEFILT8xA: %1 = carry register, %2 = destination address,
+; %3 = byte offset from r0.
+; Word (pavgw) variant: averages the rows at [r0+%3] and [r0+%3+r2] (two
+; vectors each), averages each with its PALIGNR (shift 2) counterpart, packs
+; the two results (with an extra vpermq on AVX2) and stores to [%2]. The
+; vertically averaged low vector (m2) is left in %1 on exit.
+%macro SUBSAMPLEFILT8xA 3
+    movu      m3, [r0+%3+mmsize]
+    movu      m2, [r0+%3]
+    pavgw     m3, [r0+%3+r2+mmsize]
+    pavgw     m2, [r0+%3+r2]
+    PALIGNR   %1, m3, 2, m6
+    pavgw     %1, m3
+    PALIGNR   m3, m2, 2, m6
+    pavgw     m3, m2
+%if cpuflag(xop)
+    vpperm    m3, m3, %1, m6
+%else
+    pand      m3, m7
+    pand      %1, m7
+    packssdw  m3, %1
+%endif
+%if cpuflag(avx2)
+    vpermq     m3, m3, q3120
+%endif
+    movu    [%2], m3
+    movu      %1, m2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void frame_subsample_luma( uint8_t *src0, uint8_t *dst0,
+;                              intptr_t src_stride, intptr_t dst_stride, int width, int height )
+;-----------------------------------------------------------------------------
+
+%macro FRAME_SUBSAMPLE_LUMA 0
+cglobal frame_subsample_luma, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
+%if HIGH_BIT_DEPTH
+    shl   dword r3m, 1
+    FIX_STRIDES r2
+    shl   dword r4m, 1
+%endif
+%if mmsize >= 16
+    add   dword r4m, mmsize-1
+    and   dword r4m, ~(mmsize-1)
+%endif
+    ; src += 2*(height-1)*stride + 2*width
+    mov      r6d, r5m
+    dec      r6d
+    imul     r6d, r2d
+    add      r6d, r4m
+    lea       r0, r0+r6*2
+    ; dst += (height-1)*stride + width
+    mov      r6d, r5m
+    dec      r6d
+    imul     r6d, r3m
+    add      r6d, r4m
+    add       r1, r6
+    ; gap = stride - width
+    mov      r6d, r3m
+    sub      r6d, r4m
+    PUSH      r6
+    %define dst_gap rsp+gprsize
+    mov      r6d, r2d
+    sub      r6d, r4m
+    shl      r6d, 1
+    PUSH      r6
+    %define src_gap rsp
+%if HIGH_BIT_DEPTH
+%if cpuflag(xop)
+    mova      m6, deinterleave_shuf32a
+    mova      m7, deinterleave_shuf32b
+%else
+    pcmpeqw   m7, m7
+    psrld     m7, 16
+%endif
+.vloop:
+    mov      r6d, r4m
+%ifnidn cpuname, mmx2
+    movu      m0, r0
+    movu      m1, r0+r2
+    pavgw     m0, m1
+    pavgw     m1, r0+r2*2
+%endif
+.hloop:
+    sub       r0, mmsize*2
+    sub       r1, mmsize
+%ifidn cpuname, mmx2
+    SUBSAMPLEFILT8xU r1, 0
+%else
+    SUBSAMPLEFILT8xA m0, r1, 0
+%endif
+    sub      r6d, mmsize
+    jg .hloop
+%else ; !HIGH_BIT_DEPTH
+%if cpuflag(avx2)
+    mova      m7, deinterleave_shuf
+%elif cpuflag(xop)
+    mova      m6, deinterleave_shuf32a
+    mova      m7, deinterleave_shuf32b
+%else
+    pcmpeqb   m7, m7
+    psrlw     m7, 8
+%endif
+.vloop:
+    mov      r6d, r4m
+%ifnidn cpuname, mmx2
+%if mmsize <= 16
+    mova      m0, r0

 
@@ -992,6 +992,262 @@
 FRAME_INIT_LOWRES
 %endif
 
+; SUBSAMPLEFILT8x4: %1/%2 = result registers, %3/%4 = row registers,
+; %5/%6 = extra outputs (non-XOP path only), %7 = byte offset from r0.
+; Loads the rows at [r0+%7], [r0+r2+%7] and [r0+r2*2+%7], averages
+; neighbouring rows with pavgb, then averages each row result with the
+; PALIGNR (shift 1) output. Without XOP, %5/%6 receive the results shifted
+; right by 8 bits per word and %1/%2 are masked with m7.
+%macro SUBSAMPLEFILT8x4 7
+    mova      %3, [r0+%7]
+    mova      %4, [r0+r2+%7]
+    pavgb     %3, %4
+    pavgb     %4, [r0+r2*2+%7]
+    PALIGNR   %1, %3, 1, m6
+    PALIGNR   %2, %4, 1, m6
+%if cpuflag(xop)
+    pavgb     %1, %3
+    pavgb     %2, %4
+%else
+    pavgb     %1, %3
+    pavgb     %2, %4
+    psrlw     %5, %1, 8
+    psrlw     %6, %2, 8
+    pand      %1, m7
+    pand      %2, m7
+%endif
+%endmacro
+
+; SUBSAMPLEFILT32x4U: %1 = destination address.
+; Uses unaligned loads from the rows at [r0], [r0+r2] and [r0+r2*2] (plus the
+; same rows at +1 and +mmsize), averages vertically and horizontally with
+; pavgb, shuffles both halves through m7, merges them and stores one vector
+; to [%1].
+%macro SUBSAMPLEFILT32x4U 1
+    movu      m1, [r0+r2]
+    pavgb     m0, m1, [r0]
+    movu      m3, [r0+r2+1]
+    pavgb     m2, m3, [r0+1]
+    pavgb     m1, [r0+r2*2]
+    pavgb     m3, [r0+r2*2+1]
+    pavgb     m0, m2
+    pavgb     m1, m3
+
+    movu      m3, [r0+r2+mmsize]
+    pavgb     m2, m3, [r0+mmsize]
+    movu      m5, [r0+r2+1+mmsize]
+    pavgb     m4, m5, [r0+1+mmsize]
+    pavgb     m2, m4
+
+    pshufb    m0, m7
+    pshufb    m2, m7
+    punpcklqdq m0, m0, m2
+    vpermq    m0, m0, q3120
+    movu    [%1], m0
+%endmacro
+
+; SUBSAMPLEFILT16x2: %1 = carry register, %2 = destination address,
+; %3 = byte offset from r0.
+; Averages the rows at [r0+%3] and [r0+%3+r2] (two vectors each), averages
+; each with its PALIGNR (shift 1) counterpart, packs the two results into one
+; vector and stores it to [%2]. The vertically averaged low vector (m2) is
+; left in %1 on exit.
+%macro SUBSAMPLEFILT16x2 3
+    mova      m3, [r0+%3+mmsize]
+    mova      m2, [r0+%3]
+    pavgb     m3, [r0+%3+r2+mmsize]
+    pavgb     m2, [r0+%3+r2]
+    PALIGNR   %1, m3, 1, m6
+    pavgb     %1, m3
+    PALIGNR   m3, m2, 1, m6
+    pavgb     m3, m2
+%if cpuflag(xop)
+    vpperm    m3, m3, %1, m6
+%else
+    pand      m3, m7
+    pand      %1, m7
+    packuswb  m3, %1
+%endif
+    mova    [%2], m3
+    mova      %1, m2
+%endmacro
+
+; SUBSAMPLEFILT8x2U: %1 = destination address, %2 = byte offset from r0.
+; Averages the rows at [r0+%2] and [r0+%2+r2] and the same rows at +1,
+; combines them with pavgb, masks with m7, packs and stores to [%1].
+; NOTE(review): m1 and m3 are read here but never written inside this macro;
+; their contents must come from the caller - confirm.
+%macro SUBSAMPLEFILT8x2U 2
+    mova      m2, [r0+%2]
+    pavgb     m2, [r0+%2+r2]
+    mova      m0, [r0+%2+1]
+    pavgb     m0, [r0+%2+r2+1]
+    pavgb     m1, m3
+    pavgb     m0, m2
+    pand      m1, m7
+    pand      m0, m7
+    packuswb  m0, m1
+    mova    [%1], m0
+%endmacro
+
+; SUBSAMPLEFILT8xU: %1 = destination address, %2 = byte offset from r0.
+; Word (pavgw) variant: averages the rows at [r0+%2] and [r0+%2+r2], both at
+; the base position and 2 bytes further on, combines the two, masks with m7,
+; packs with packssdw and stores to [%1] with an unaligned store.
+%macro SUBSAMPLEFILT8xU 2
+    mova      m3, [r0+%2+8]
+    mova      m2, [r0+%2]
+    pavgw     m3, [r0+%2+r2+8]
+    pavgw     m2, [r0+%2+r2]
+    movu      m1, [r0+%2+10]
+    movu      m0, [r0+%2+2]
+    pavgw     m1, [r0+%2+r2+10]
+    pavgw     m0, [r0+%2+r2+2]
+    pavgw     m1, m3
+    pavgw     m0, m2
+    psrld     m3, m1, 16
+    pand      m1, m7
+    pand      m0, m7
+    packssdw  m0, m1
+    movu    [%1], m0
+%endmacro
+
+; SUBSAMPLEFILT8xA: %1 = carry register, %2 = destination address,
+; %3 = byte offset from r0.
+; Word (pavgw) variant: averages the rows at [r0+%3] and [r0+%3+r2] (two
+; vectors each), averages each with its PALIGNR (shift 2) counterpart, packs
+; the two results (with an extra vpermq on AVX2) and stores to [%2]. The
+; vertically averaged low vector (m2) is left in %1 on exit.
+%macro SUBSAMPLEFILT8xA 3
+    movu      m3, [r0+%3+mmsize]
+    movu      m2, [r0+%3]
+    pavgw     m3, [r0+%3+r2+mmsize]
+    pavgw     m2, [r0+%3+r2]
+    PALIGNR   %1, m3, 2, m6
+    pavgw     %1, m3
+    PALIGNR   m3, m2, 2, m6
+    pavgw     m3, m2
+%if cpuflag(xop)
+    vpperm    m3, m3, %1, m6
+%else
+    pand      m3, m7
+    pand      %1, m7
+    packssdw  m3, %1
+%endif
+%if cpuflag(avx2)
+    vpermq     m3, m3, q3120
+%endif
+    movu    [%2], m3
+    movu      %1, m2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void frame_subsample_luma( uint8_t *src0, uint8_t *dst0,
+;                              intptr_t src_stride, intptr_t dst_stride, int width, int height )
+;-----------------------------------------------------------------------------
+
+%macro FRAME_SUBSAMPLE_LUMA 0
+cglobal frame_subsample_luma, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
+%if HIGH_BIT_DEPTH
+    shl   dword r3m, 1
+    FIX_STRIDES r2
+    shl   dword r4m, 1
+%endif
+%if mmsize >= 16
+    add   dword r4m, mmsize-1
+    and   dword r4m, ~(mmsize-1)
+%endif
+    ; src += 2*(height-1)*stride + 2*width
+    mov      r6d, r5m
+    dec      r6d
+    imul     r6d, r2d
+    add      r6d, r4m
+    lea       r0, r0+r6*2
+    ; dst += (height-1)*stride + width
+    mov      r6d, r5m
+    dec      r6d
+    imul     r6d, r3m
+    add      r6d, r4m
+    add       r1, r6
+    ; gap = stride - width
+    mov      r6d, r3m
+    sub      r6d, r4m
+    PUSH      r6
+    %define dst_gap rsp+gprsize
+    mov      r6d, r2d
+    sub      r6d, r4m
+    shl      r6d, 1
+    PUSH      r6
+    %define src_gap rsp
+%if HIGH_BIT_DEPTH
+%if cpuflag(xop)
+    mova      m6, deinterleave_shuf32a
+    mova      m7, deinterleave_shuf32b
+%else
+    pcmpeqw   m7, m7
+    psrld     m7, 16
+%endif
+.vloop:
+    mov      r6d, r4m
+%ifnidn cpuname, mmx2
+    movu      m0, r0
+    movu      m1, r0+r2
+    pavgw     m0, m1
+    pavgw     m1, r0+r2*2
+%endif
+.hloop:
+    sub       r0, mmsize*2
+    sub       r1, mmsize
+%ifidn cpuname, mmx2
+    SUBSAMPLEFILT8xU r1, 0
+%else
+    SUBSAMPLEFILT8xA m0, r1, 0
+%endif
+    sub      r6d, mmsize
+    jg .hloop
+%else ; !HIGH_BIT_DEPTH
+%if cpuflag(avx2)
+    mova      m7, deinterleave_shuf
+%elif cpuflag(xop)
+    mova      m6, deinterleave_shuf32a
+    mova      m7, deinterleave_shuf32b
+%else
+    pcmpeqb   m7, m7
+    psrlw     m7, 8
+%endif
+.vloop:
+    mov      r6d, r4m
+%ifnidn cpuname, mmx2
+%if mmsize <= 16
+    mova      m0, r0
​

x265_3.5.tar.gz/source/common/x86/mc.h -> x265_3.6.tar.gz/source/common/x86/mc.h Changed

 
@@ -36,6 +36,17 @@
 
 #undef LOWRES
 
+/* Declares the prototype of the frame_subsample_luma primitive for one CPU
+ * flavour: source and destination pixel pointers, their strides, and the
+ * width and height. The macro is instantiated once per flavour below and
+ * then undefined so the helper name does not leak out of this header. */
+#define SUBSAMPLELUMA(cpu) \
+    void PFX(frame_subsample_luma_ ## cpu)(const pixel* src0, pixel* dst0, intptr_t src_stride, intptr_t dst_stride, int width, int height);
+SUBSAMPLELUMA(mmx2)
+SUBSAMPLELUMA(sse2)
+SUBSAMPLELUMA(ssse3)
+SUBSAMPLELUMA(avx)
+SUBSAMPLELUMA(avx2)
+SUBSAMPLELUMA(xop)
+
+#undef SUBSAMPLELUMA
+
 #define PROPAGATE_COST(cpu) \
     void PFX(mbtree_propagate_cost_ ## cpu)(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, \
                                               const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
​

x265_3.5.tar.gz/source/common/x86/x86inc.asm -> x265_3.6.tar.gz/source/common/x86/x86inc.asm Changed

@@ -401,16 +401,6 @@
     %endif
 %endmacro
 
-%macro DEFINE_ARGS_INTERNAL 3+
-    %ifnum %2
-        DEFINE_ARGS %3
-    %elif %1 == 4
-        DEFINE_ARGS %2
-    %elif %1 > 4
-        DEFINE_ARGS %2, %3
-    %endif
-%endmacro
-
 %if WIN64 ; Windows x64 ;=================================================
 
 DECLARE_REG 0,  rcx
@@ -429,7 +419,7 @@
 DECLARE_REG 13, R12, 112
 DECLARE_REG 14, R13, 120
 
-%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
     %assign num_args %1
     %assign regs_used %2
     ASSERT regs_used >= num_args
@@ -441,7 +431,15 @@
         WIN64_SPILL_XMM %3
     %endif
     LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
-    DEFINE_ARGS_INTERNAL %0, %4, %5
+    %if %0 > 4
+         %ifnum %4
+             DEFINE_ARGS %5
+         %else
+             DEFINE_ARGS %4, %5
+         %endif
+     %elifnnum %4
+         DEFINE_ARGS %4
+     %endif
 %endmacro
 
 %macro WIN64_PUSH_XMM 0
@@ -537,7 +535,7 @@
 DECLARE_REG 13, R12, 64
 DECLARE_REG 14, R13, 72
 
-%macro PROLOGUE 2-5+ 0; #args, #regs, #xmm_regs, stack_size, arg_names...
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
     %assign num_args %1
     %assign regs_used %2
     %assign xmm_regs_used %3
@@ -547,7 +545,15 @@
     PUSH_IF_USED 9, 10, 11, 12, 13, 14
     ALLOC_STACK %4
     LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
-    DEFINE_ARGS_INTERNAL %0, %4, %5
+    %if %0 > 4
+         %ifnum %4
+             DEFINE_ARGS %5
+         %else
+             DEFINE_ARGS %4, %5
+         %endif
+     %elifnnum %4
+         DEFINE_ARGS %4
+     %endif
 %endmacro
 
 %define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
@@ -588,7 +594,7 @@
 
 DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, stack_size, arg_names...
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
     %assign num_args %1
     %assign regs_used %2
     ASSERT regs_used >= num_args
@@ -603,7 +609,15 @@
     PUSH_IF_USED 3, 4, 5, 6
     ALLOC_STACK %4
     LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
-    DEFINE_ARGS_INTERNAL %0, %4, %5
+    %if %0 > 4
+         %ifnum %4
+             DEFINE_ARGS %5
+         %else
+             DEFINE_ARGS %4, %5
+         %endif
+     %elifnnum %4
+         DEFINE_ARGS %4
+     %endif
 %endmacro
 
 %define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required

 
@@ -401,16 +401,6 @@
     %endif
 %endmacro
 
-%macro DEFINE_ARGS_INTERNAL 3+
-    %ifnum %2
-        DEFINE_ARGS %3
-    %elif %1 == 4
-        DEFINE_ARGS %2
-    %elif %1 > 4
-        DEFINE_ARGS %2, %3
-    %endif
-%endmacro
-
 %if WIN64 ; Windows x64 ;=================================================
 
 DECLARE_REG 0,  rcx
@@ -429,7 +419,7 @@
 DECLARE_REG 13, R12, 112
 DECLARE_REG 14, R13, 120
 
-%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
     %assign num_args %1
     %assign regs_used %2
     ASSERT regs_used >= num_args
@@ -441,7 +431,15 @@
         WIN64_SPILL_XMM %3
     %endif
     LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
-    DEFINE_ARGS_INTERNAL %0, %4, %5
+    %if %0 > 4
+         %ifnum %4
+             DEFINE_ARGS %5
+         %else
+             DEFINE_ARGS %4, %5
+         %endif
+     %elifnnum %4
+         DEFINE_ARGS %4
+     %endif
 %endmacro
 
 %macro WIN64_PUSH_XMM 0
@@ -537,7 +535,7 @@
 DECLARE_REG 13, R12, 64
 DECLARE_REG 14, R13, 72
 
-%macro PROLOGUE 2-5+ 0; #args, #regs, #xmm_regs, stack_size, arg_names...
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
     %assign num_args %1
     %assign regs_used %2
     %assign xmm_regs_used %3
@@ -547,7 +545,15 @@
     PUSH_IF_USED 9, 10, 11, 12, 13, 14
     ALLOC_STACK %4
     LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
-    DEFINE_ARGS_INTERNAL %0, %4, %5
+    %if %0 > 4
+         %ifnum %4
+             DEFINE_ARGS %5
+         %else
+             DEFINE_ARGS %4, %5
+         %endif
+     %elifnnum %4
+         DEFINE_ARGS %4
+     %endif
 %endmacro
 
 %define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
@@ -588,7 +594,7 @@
 
 DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, stack_size, arg_names...
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
     %assign num_args %1
     %assign regs_used %2
     ASSERT regs_used >= num_args
@@ -603,7 +609,15 @@
     PUSH_IF_USED 3, 4, 5, 6
     ALLOC_STACK %4
     LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
-    DEFINE_ARGS_INTERNAL %0, %4, %5
+    %if %0 > 4
+         %ifnum %4
+             DEFINE_ARGS %5
+         %else
+             DEFINE_ARGS %4, %5
+         %endif
+     %elifnnum %4
+         DEFINE_ARGS %4
+     %endif
 %endmacro
 
 %define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
​

x265_3.5.tar.gz/source/common/x86/x86util.asm -> x265_3.6.tar.gz/source/common/x86/x86util.asm Changed

 
@@ -578,8 +578,10 @@
     %elif %1==2
         %if mmsize==8
             SBUTTERFLY dq, %3, %4, %5
-        %else
+        %elif %0==6
             TRANS q, ORDER, %3, %4, %5, %6
+        %else
+            TRANS q, ORDER, %3, %4, %5
         %endif
     %elif %1==4
         SBUTTERFLY qdq, %3, %4, %5
​

x265_3.5.tar.gz/source/encoder/analysis.cpp -> x265_3.6.tar.gz/source/encoder/analysis.cpp Changed

 
@@ -3645,7 +3645,7 @@
             qp += distortionData->offsetctu.m_cuAddr;
     }
 
-    if (m_param->analysisLoadReuseLevel == 10 && m_param->rc.cuTree)
+    if (m_param->analysisLoadReuseLevel >= 2 && m_param->rc.cuTree)
     {
         int cuIdx = (ctu.m_cuAddr * ctu.m_numPartitions) + cuGeom.absPartIdx;
         if (ctu.m_slice->m_sliceType == I_SLICE)
​

x265_3.5.tar.gz/source/encoder/api.cpp -> x265_3.6.tar.gz/source/encoder/api.cpp Changed

@@ -208,7 +208,6 @@
     memcpy(zoneParam, param, sizeof(x265_param));
     for (int i = 0; i < param->rc.zonefileCount; i++)
     {
-        param->rc.zones[i].startFrame = -1;
         encoder->configureZone(zoneParam, param->rc.zones[i].zoneParam);
     }
 
@@ -608,6 +607,14 @@
     if (numEncoded < 0)
         encoder->m_aborted = true;
 
+    if ((!encoder->m_numDelayedPic && !numEncoded) && (encoder->m_param->bEnableEndOfSequence || encoder->m_param->bEnableEndOfBitstream))
+    {
+        Bitstream bs;
+        encoder->getEndNalUnits(encoder->m_nalList, bs);
+        *pp_nal = &encoder->m_nalList.m_nal[0];
+        if (pi_nal) *pi_nal = encoder->m_nalList.m_numNal;
+    }
+
     return numEncoded;
 }
 
@@ -1042,6 +1049,7 @@
     &PARAM_NS::x265_param_free,
     &PARAM_NS::x265_param_default,
     &PARAM_NS::x265_param_parse,
+    &PARAM_NS::x265_scenecut_aware_qp_param_parse,
     &PARAM_NS::x265_param_apply_profile,
     &PARAM_NS::x265_param_default_preset,
     &x265_picture_alloc,
@@ -1288,6 +1296,8 @@
             if (param->csvLogLevel)
             {
                 fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
+                if (!!param->bEnableTemporalSubLayers)
+                    fprintf(csvfp, "Temporal Sub Layer ID, ");
                 if (param->csvLogLevel >= 2)
                     fprintf(csvfp, "I/P cost ratio, ");
                 if (param->rc.rateControlMode == X265_RC_CRF)
@@ -1401,6 +1411,8 @@
     const x265_frame_stats* frameStats = &pic->frameData;
     fprintf(param->csvfpt, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc,
                                                                    frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
+    if (!!param->bEnableTemporalSubLayers)
+        fprintf(param->csvfpt, "%d,", frameStats->tLayer);
     if (param->csvLogLevel >= 2)
         fprintf(param->csvfpt, "%.2f,", frameStats->ipCostRatio);
     if (param->rc.rateControlMode == X265_RC_CRF)

 
@@ -208,7 +208,6 @@
     memcpy(zoneParam, param, sizeof(x265_param));
     for (int i = 0; i < param->rc.zonefileCount; i++)
     {
-        param->rc.zones[i].startFrame = -1;
         encoder->configureZone(zoneParam, param->rc.zones[i].zoneParam);
     }
 
@@ -608,6 +607,14 @@
     if (numEncoded < 0)
         encoder->m_aborted = true;
 
+    if ((!encoder->m_numDelayedPic && !numEncoded) && (encoder->m_param->bEnableEndOfSequence || encoder->m_param->bEnableEndOfBitstream))
+    {
+        Bitstream bs;
+        encoder->getEndNalUnits(encoder->m_nalList, bs);
+        *pp_nal = &encoder->m_nalList.m_nal[0];
+        if (pi_nal) *pi_nal = encoder->m_nalList.m_numNal;
+    }
+
     return numEncoded;
 }
 
@@ -1042,6 +1049,7 @@
     &PARAM_NS::x265_param_free,
     &PARAM_NS::x265_param_default,
     &PARAM_NS::x265_param_parse,
+    &PARAM_NS::x265_scenecut_aware_qp_param_parse,
     &PARAM_NS::x265_param_apply_profile,
     &PARAM_NS::x265_param_default_preset,
     &x265_picture_alloc,
@@ -1288,6 +1296,8 @@
             if (param->csvLogLevel)
             {
                 fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
+                if (!!param->bEnableTemporalSubLayers)
+                    fprintf(csvfp, "Temporal Sub Layer ID, ");
                 if (param->csvLogLevel >= 2)
                     fprintf(csvfp, "I/P cost ratio, ");
                 if (param->rc.rateControlMode == X265_RC_CRF)
@@ -1401,6 +1411,8 @@
     const x265_frame_stats* frameStats = &pic->frameData;
     fprintf(param->csvfpt, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc,
                                                                    frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
+    if (!!param->bEnableTemporalSubLayers)
+        fprintf(param->csvfpt, "%d,", frameStats->tLayer);
     if (param->csvLogLevel >= 2)
         fprintf(param->csvfpt, "%.2f,", frameStats->ipCostRatio);
     if (param->rc.rateControlMode == X265_RC_CRF)
​

x265_3.5.tar.gz/source/encoder/dpb.cpp -> x265_3.6.tar.gz/source/encoder/dpb.cpp Changed

@@ -70,10 +70,18 @@
     {
         Frame *curFrame = iterFrame;
         iterFrame = iterFrame->m_next;
-        if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders)
+        bool isMCSTFReferenced = false;
+
+        if (curFrame->m_param->bEnableTemporalFilter)
+            isMCSTFReferenced =!!(curFrame->m_refPicCnt[1]);
+
+        if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders && !isMCSTFReferenced)
         {
             curFrame->m_bChromaExtended = false;
 
+            if (curFrame->m_param->bEnableTemporalFilter)
+                *curFrame->m_isSubSampled = false;
+
             // Reset column counter
             X265_CHECK(curFrame->m_reconRowFlag != NULL, "curFrame->m_reconRowFlag check failure");
             X265_CHECK(curFrame->m_reconColCount != NULL, "curFrame->m_reconColCount check failure");
@@ -142,12 +150,13 @@
     {
         newFrame->m_encData->m_bHasReferences = false;
 
+        newFrame->m_tempLayer = (newFrame->m_param->bEnableTemporalSubLayers && !m_bTemporalSublayer) ? 1 : newFrame->m_tempLayer;
         // Adjust NAL type for unreferenced B frames (change from _R "referenced"
         // to _N "non-referenced" NAL unit type)
         switch (slice->m_nalUnitType)
         {
         case NAL_UNIT_CODED_SLICE_TRAIL_R:
-            slice->m_nalUnitType = m_bTemporalSublayer ? NAL_UNIT_CODED_SLICE_TSA_N : NAL_UNIT_CODED_SLICE_TRAIL_N;
+            slice->m_nalUnitType = newFrame->m_param->bEnableTemporalSubLayers ? NAL_UNIT_CODED_SLICE_TSA_N : NAL_UNIT_CODED_SLICE_TRAIL_N;
             break;
         case NAL_UNIT_CODED_SLICE_RADL_R:
             slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RADL_N;
@@ -168,13 +177,94 @@
 
     m_picList.pushFront(*newFrame);
 
+    if (m_bTemporalSublayer && getTemporalLayerNonReferenceFlag())
+    {
+        switch (slice->m_nalUnitType)
+        {
+        case NAL_UNIT_CODED_SLICE_TRAIL_R:
+            slice->m_nalUnitType =  NAL_UNIT_CODED_SLICE_TRAIL_N;
+            break;
+        case NAL_UNIT_CODED_SLICE_RADL_R:
+            slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RADL_N;
+            break;
+        case NAL_UNIT_CODED_SLICE_RASL_R:
+            slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RASL_N;
+            break;
+        default:
+            break;
+        }
+    }
     // Do decoding refresh marking if any
     decodingRefreshMarking(pocCurr, slice->m_nalUnitType);
 
-    computeRPS(pocCurr, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBuffering);
-
+    computeRPS(pocCurr, newFrame->m_tempLayer, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBuffering[newFrame->m_tempLayer]);
+    bool isTSAPic = ((slice->m_nalUnitType == 2) || (slice->m_nalUnitType == 3)) ? true : false;
     // Mark pictures in m_piclist as unreferenced if they are not included in RPS
-    applyReferencePictureSet(&slice->m_rps, pocCurr);
+    applyReferencePictureSet(&slice->m_rps, pocCurr, newFrame->m_tempLayer, isTSAPic);
+
+
+    if (m_bTemporalSublayer && newFrame->m_tempLayer > 0
+        && !(slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RADL_N     // Check if not a leading picture
+            || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RADL_R
+            || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RASL_N
+            || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RASL_R)
+        )
+    {
+        if (isTemporalLayerSwitchingPoint(pocCurr, newFrame->m_tempLayer) || (slice->m_sps->maxTempSubLayers == 1))
+        {
+            if (getTemporalLayerNonReferenceFlag())
+            {
+                slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_N;
+            }
+            else
+            {
+                slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_R;
+            }
+        }
+        else if (isStepwiseTemporalLayerSwitchingPoint(&slice->m_rps, pocCurr, newFrame->m_tempLayer))
+        {
+            bool isSTSA = true;
+            int id = newFrame->m_gopOffset % x265_gop_ra_length[newFrame->m_gopId];
+            for (int ii = id; (ii < x265_gop_ra_length[newFrame->m_gopId] && isSTSA == true); ii++)
+            {
+                int tempIdRef = x265_gop_ra[newFrame->m_gopId][ii].layer;
+                if (tempIdRef == newFrame->m_tempLayer)
+                {
+                    for (int jj = 0; jj < slice->m_rps.numberOfPositivePictures + slice->m_rps.numberOfNegativePictures; jj++)
+                    {
+                        if (slice->m_rps.bUsed[jj])
+                        {
+                            int refPoc = x265_gop_ra[newFrame->m_gopId][ii].poc_offset + slice->m_rps.deltaPOC[jj];
+                            int kk = 0;
+                            for (kk = 0; kk < x265_gop_ra_length[newFrame->m_gopId]; kk++)
+                            {
+                                if (x265_gop_ra[newFrame->m_gopId][kk].poc_offset == refPoc)
+                                {
+                                    break;
+                                }
+                            }
+                            if (x265_gop_ra[newFrame->m_gopId][kk].layer >= newFrame->m_tempLayer)
+                            {
+                                isSTSA = false;
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+            if (isSTSA == true)
+            {
+                if (getTemporalLayerNonReferenceFlag())
+                {
+                    slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_STSA_N;
+                }
+                else
+                {
+                    slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_STSA_R;
+                }
+            }
+        }
+    }
 
     if (slice->m_sliceType != I_SLICE)
         slice->m_numRefIdx0 = x265_clip3(1, newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures);
@@ -218,7 +308,7 @@
     }
 }
 
-void DPB::computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer)
+void DPB::computeRPS(int curPoc, int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer)
 {
     unsigned int poci = 0, numNeg = 0, numPos = 0;
 
@@ -228,7 +318,7 @@
     {
         if ((iterPic->m_poc != curPoc) && iterPic->m_encData->m_bHasReferences)
         {
-            if ((m_lastIDR >= curPoc) || (m_lastIDR <= iterPic->m_poc))
+            if ((!m_bTemporalSublayer || (iterPic->m_tempLayer <= tempId)) && ((m_lastIDR >= curPoc) || (m_lastIDR <= iterPic->m_poc)))
             {
                     rps->poc[poci] = iterPic->m_poc;
                     rps->deltaPOC[poci] = rps->poc[poci] - curPoc;
@@ -247,6 +337,18 @@
     rps->sortDeltaPOC();
 }
 
+bool DPB::getTemporalLayerNonReferenceFlag()
+{
+    Frame* curFrame = m_picList.first();
+    if (curFrame->m_encData->m_bHasReferences)
+    {
+        curFrame->m_sameLayerRefPic = true;
+        return false;
+    }
+    else
+        return true;
+}
+
 /* Marking reference pictures when an IDR/CRA is encountered. */
 void DPB::decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType)
 {
@@ -296,7 +398,7 @@
 }
 
 /** Function for applying picture marking based on the Reference Picture Set */
-void DPB::applyReferencePictureSet(RPS *rps, int curPoc)
+void DPB::applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture)
 {
     // loop through all pictures in the reference picture buffer
     Frame* iterFrame = m_picList.first();
@@ -317,9 +419,68 @@
             }
             if (!referenced)
                 iterFrame->m_encData->m_bHasReferences = false;
+
+            if (m_bTemporalSublayer)
+            {
+                //check that pictures of higher temporal layers are not used
+                assert(referenced == 0 || iterFrame->m_encData->m_bHasReferences == false || iterFrame->m_tempLayer <= tempId);
+
+                //check that pictures of higher or equal temporal layer are not in the RPS if the current picture is a TSA picture
+                if (isTSAPicture)
+                {
+                    assert(referenced == 0 || iterFrame->m_tempLayer < tempId);
+                }
+                //check that pictures marked as temporal layer non-reference pictures are not used for reference
+                if (iterFrame->m_tempLayer == tempId)
+                {
+                    assert(referenced == 0 || iterFrame->m_sameLayerRefPic == true);
+                }
+            }

 
@@ -70,10 +70,18 @@
     {
         Frame *curFrame = iterFrame;
         iterFrame = iterFrame->m_next;
-        if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders)
+        bool isMCSTFReferenced = false;
+
+        if (curFrame->m_param->bEnableTemporalFilter)
+            isMCSTFReferenced =!!(curFrame->m_refPicCnt[1]);
+
+        if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders && !isMCSTFReferenced)
         {
             curFrame->m_bChromaExtended = false;
 
+            if (curFrame->m_param->bEnableTemporalFilter)
+                *curFrame->m_isSubSampled = false;
+
             // Reset column counter
             X265_CHECK(curFrame->m_reconRowFlag != NULL, "curFrame->m_reconRowFlag check failure");
             X265_CHECK(curFrame->m_reconColCount != NULL, "curFrame->m_reconColCount check failure");
@@ -142,12 +150,13 @@
     {
         newFrame->m_encData->m_bHasReferences = false;
 
+        newFrame->m_tempLayer = (newFrame->m_param->bEnableTemporalSubLayers && !m_bTemporalSublayer) ? 1 : newFrame->m_tempLayer;
         // Adjust NAL type for unreferenced B frames (change from _R "referenced"
         // to _N "non-referenced" NAL unit type)
         switch (slice->m_nalUnitType)
         {
         case NAL_UNIT_CODED_SLICE_TRAIL_R:
-            slice->m_nalUnitType = m_bTemporalSublayer ? NAL_UNIT_CODED_SLICE_TSA_N : NAL_UNIT_CODED_SLICE_TRAIL_N;
+            slice->m_nalUnitType = newFrame->m_param->bEnableTemporalSubLayers ? NAL_UNIT_CODED_SLICE_TSA_N : NAL_UNIT_CODED_SLICE_TRAIL_N;
             break;
         case NAL_UNIT_CODED_SLICE_RADL_R:
             slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RADL_N;
@@ -168,13 +177,94 @@
 
     m_picList.pushFront(*newFrame);
 
+    if (m_bTemporalSublayer && getTemporalLayerNonReferenceFlag())
+    {
+        switch (slice->m_nalUnitType)
+        {
+        case NAL_UNIT_CODED_SLICE_TRAIL_R:
+            slice->m_nalUnitType =  NAL_UNIT_CODED_SLICE_TRAIL_N;
+            break;
+        case NAL_UNIT_CODED_SLICE_RADL_R:
+            slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RADL_N;
+            break;
+        case NAL_UNIT_CODED_SLICE_RASL_R:
+            slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RASL_N;
+            break;
+        default:
+            break;
+        }
+    }
     // Do decoding refresh marking if any
     decodingRefreshMarking(pocCurr, slice->m_nalUnitType);
 
-    computeRPS(pocCurr, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBuffering);
-
+    computeRPS(pocCurr, newFrame->m_tempLayer, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBuffering[newFrame->m_tempLayer]);
+    bool isTSAPic = ((slice->m_nalUnitType == 2) || (slice->m_nalUnitType == 3)) ? true : false;
     // Mark pictures in m_piclist as unreferenced if they are not included in RPS
-    applyReferencePictureSet(&slice->m_rps, pocCurr);
+    applyReferencePictureSet(&slice->m_rps, pocCurr, newFrame->m_tempLayer, isTSAPic);
+
+
+    if (m_bTemporalSublayer && newFrame->m_tempLayer > 0
+        && !(slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RADL_N     // Check if not a leading picture
+            || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RADL_R
+            || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RASL_N
+            || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RASL_R)
+        )
+    {
+        if (isTemporalLayerSwitchingPoint(pocCurr, newFrame->m_tempLayer) || (slice->m_sps->maxTempSubLayers == 1))
+        {
+            if (getTemporalLayerNonReferenceFlag())
+            {
+                slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_N;
+            }
+            else
+            {
+                slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_R;
+            }
+        }
+        else if (isStepwiseTemporalLayerSwitchingPoint(&slice->m_rps, pocCurr, newFrame->m_tempLayer))
+        {
+            bool isSTSA = true;
+            int id = newFrame->m_gopOffset % x265_gop_ra_length[newFrame->m_gopId];
+            for (int ii = id; (ii < x265_gop_ra_length[newFrame->m_gopId] && isSTSA == true); ii++)
+            {
+                int tempIdRef = x265_gop_ra[newFrame->m_gopId][ii].layer;
+                if (tempIdRef == newFrame->m_tempLayer)
+                {
+                    for (int jj = 0; jj < slice->m_rps.numberOfPositivePictures + slice->m_rps.numberOfNegativePictures; jj++)
+                    {
+                        if (slice->m_rps.bUsed[jj])
+                        {
+                            int refPoc = x265_gop_ra[newFrame->m_gopId][ii].poc_offset + slice->m_rps.deltaPOC[jj];
+                            int kk = 0;
+                            for (kk = 0; kk < x265_gop_ra_length[newFrame->m_gopId]; kk++)
+                            {
+                                if (x265_gop_ra[newFrame->m_gopId][kk].poc_offset == refPoc)
+                                {
+                                    break;
+                                }
+                            }
+                            if (x265_gop_ra[newFrame->m_gopId][kk].layer >= newFrame->m_tempLayer)
+                            {
+                                isSTSA = false;
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+            if (isSTSA == true)
+            {
+                if (getTemporalLayerNonReferenceFlag())
+                {
+                    slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_STSA_N;
+                }
+                else
+                {
+                    slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_STSA_R;
+                }
+            }
+        }
+    }
 
     if (slice->m_sliceType != I_SLICE)
         slice->m_numRefIdx0 = x265_clip3(1, newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures);
@@ -218,7 +308,7 @@
     }
 }
 
-void DPB::computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer)
+void DPB::computeRPS(int curPoc, int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer)
 {
     unsigned int poci = 0, numNeg = 0, numPos = 0;
 
@@ -228,7 +318,7 @@
     {
         if ((iterPic->m_poc != curPoc) && iterPic->m_encData->m_bHasReferences)
         {
-            if ((m_lastIDR >= curPoc) || (m_lastIDR <= iterPic->m_poc))
+            if ((!m_bTemporalSublayer || (iterPic->m_tempLayer <= tempId)) && ((m_lastIDR >= curPoc) || (m_lastIDR <= iterPic->m_poc)))
             {
                     rps->poc[poci] = iterPic->m_poc;
                     rps->deltaPOC[poci] = rps->poc[poci] - curPoc;
@@ -247,6 +337,18 @@
     rps->sortDeltaPOC();
 }
 
+bool DPB::getTemporalLayerNonReferenceFlag()
+{
+    Frame* curFrame = m_picList.first();
+    if (curFrame->m_encData->m_bHasReferences)
+    {
+        curFrame->m_sameLayerRefPic = true;
+        return false;
+    }
+    else
+        return true;
+}
+
 /* Marking reference pictures when an IDR/CRA is encountered. */
 void DPB::decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType)
 {
@@ -296,7 +398,7 @@
 }
 
 /** Function for applying picture marking based on the Reference Picture Set */
-void DPB::applyReferencePictureSet(RPS *rps, int curPoc)
+void DPB::applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture)
 {
     // loop through all pictures in the reference picture buffer
     Frame* iterFrame = m_picList.first();
@@ -317,9 +419,68 @@
             }
             if (!referenced)
                 iterFrame->m_encData->m_bHasReferences = false;
+
+            if (m_bTemporalSublayer)
+            {
+                //check that pictures of higher temporal layers are not used
+                assert(referenced == 0 || iterFrame->m_encData->m_bHasReferences == false || iterFrame->m_tempLayer <= tempId);
+
+                //check that pictures of higher or equal temporal layer are not in the RPS if the current picture is a TSA picture
+                if (isTSAPicture)
+                {
+                    assert(referenced == 0 || iterFrame->m_tempLayer < tempId);
+                }
+                //check that pictures marked as temporal layer non-reference pictures are not used for reference
+                if (iterFrame->m_tempLayer == tempId)
+                {
+                    assert(referenced == 0 || iterFrame->m_sameLayerRefPic == true);
+                }
+            }
​

x265_3.5.tar.gz/source/encoder/dpb.h -> x265_3.6.tar.gz/source/encoder/dpb.h Changed

@@ -40,6 +40,7 @@
     int                m_lastIDR;
     int                m_pocCRA;
     int                m_bOpenGOP;
+	int                m_craNal;
     int                m_bhasLeadingPicture;
     bool               m_bRefreshPending;
     bool               m_bTemporalSublayer;
@@ -66,7 +67,8 @@
         m_bRefreshPending = false;
         m_frameDataFreeList = NULL;
         m_bOpenGOP = param->bOpenGOP;
-        m_bTemporalSublayer = !!param->bEnableTemporalSubLayers;
+		m_craNal = param->craNal;
+        m_bTemporalSublayer = (param->bEnableTemporalSubLayers > 2);
     }
 
     ~DPB();
@@ -77,10 +79,13 @@
 
 protected:
 
-    void computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
+    void computeRPS(int curPoc,int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
 
-    void applyReferencePictureSet(RPS *rps, int curPoc);
+    void applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture);
+    bool getTemporalLayerNonReferenceFlag();
     void decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType);
+    bool isTemporalLayerSwitchingPoint(int curPoc, int tempId);
+    bool isStepwiseTemporalLayerSwitchingPoint(RPS *rps, int curPoc, int tempId);
 
     NalUnitType getNalUnitType(int curPoc, bool bIsKeyFrame);
 };

 
@@ -40,6 +40,7 @@
     int                m_lastIDR;
     int                m_pocCRA;
     int                m_bOpenGOP;
+   int                m_craNal;
     int                m_bhasLeadingPicture;
     bool               m_bRefreshPending;
     bool               m_bTemporalSublayer;
@@ -66,7 +67,8 @@
         m_bRefreshPending = false;
         m_frameDataFreeList = NULL;
         m_bOpenGOP = param->bOpenGOP;
-        m_bTemporalSublayer = !!param->bEnableTemporalSubLayers;
+       m_craNal = param->craNal;
+        m_bTemporalSublayer = (param->bEnableTemporalSubLayers > 2);
     }
 
     ~DPB();
@@ -77,10 +79,13 @@
 
 protected:
 
-    void computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
+    void computeRPS(int curPoc,int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
 
-    void applyReferencePictureSet(RPS *rps, int curPoc);
+    void applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture);
+    bool getTemporalLayerNonReferenceFlag();
     void decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType);
+    bool isTemporalLayerSwitchingPoint(int curPoc, int tempId);
+    bool isStepwiseTemporalLayerSwitchingPoint(RPS *rps, int curPoc, int tempId);
 
     NalUnitType getNalUnitType(int curPoc, bool bIsKeyFrame);
 };
​

x265_3.5.tar.gz/source/encoder/encoder.cpp -> x265_3.6.tar.gz/source/encoder/encoder.cpp Changed

@@ -72,7 +72,40 @@
 {
     { 1, 1, 1, 1, 1, 5, 1,  2, 2, 2, 50 },
     { 1, 1, 1, 1, 1, 5, 0, 16, 9, 9, 81 },
-    { 1, 1, 1, 1, 1, 5, 0,  1, 1, 1, 82 }
+    { 1, 1, 1, 1, 1, 5, 0,  1, 1, 1, 82 },
+    { 1, 1, 1, 1, 1, 5, 0, 18, 9, 9, 84 }
+};
+
+typedef struct
+{
+    int bEnableVideoSignalTypePresentFlag;
+    int bEnableColorDescriptionPresentFlag;
+    int bEnableChromaLocInfoPresentFlag;
+    int colorPrimaries;
+    int transferCharacteristics;
+    int matrixCoeffs;
+    int bEnableVideoFullRangeFlag;
+    int chromaSampleLocTypeTopField;
+    int chromaSampleLocTypeBottomField;
+    const char* systemId;
+}VideoSignalTypePresets;
+
+VideoSignalTypePresets vstPresets[] =
+{
+    {1, 1, 1, 6, 6, 6, 0, 0, 0, "BT601_525"},
+    {1, 1, 1, 5, 6, 5, 0, 0, 0, "BT601_626"},
+    {1, 1, 1, 1, 1, 1, 0, 0, 0, "BT709_YCC"},
+    {1, 1, 0, 1, 1, 0, 0, 0, 0, "BT709_RGB"},
+    {1, 1, 1, 9, 14, 1, 0, 2, 2, "BT2020_YCC_NCL"},
+    {1, 1, 0, 9, 16, 9, 0, 0, 0, "BT2020_RGB"},
+    {1, 1, 1, 9, 16, 9, 0, 2, 2, "BT2100_PQ_YCC"},
+    {1, 1, 1, 9, 16, 14, 0, 2, 2, "BT2100_PQ_ICTCP"},
+    {1, 1, 0, 9, 16, 0, 0, 0, 0, "BT2100_PQ_RGB"},
+    {1, 1, 1, 9, 18, 9, 0, 2, 2, "BT2100_HLG_YCC"},
+    {1, 1, 0, 9, 18, 0, 0, 0, 0, "BT2100_HLG_RGB"},
+    {1, 1, 0, 1, 1, 0, 1, 0, 0, "FR709_RGB"},
+    {1, 1, 0, 9, 14, 0, 1, 0, 0, "FR2020_RGB"},
+    {1, 1, 1, 12, 1, 6, 1, 1, 1, "FRP3D65_YCC"}
 };
 }
 
@@ -109,6 +142,7 @@
     m_threadPool = NULL;
     m_analysisFileIn = NULL;
     m_analysisFileOut = NULL;
+    m_filmGrainIn = NULL;
     m_naluFile = NULL;
     m_offsetEmergency = NULL;
     m_iFrameNum = 0;
@@ -134,12 +168,8 @@
     m_prevTonemapPayload.payload = NULL;
     m_startPoint = 0;
     m_saveCTUSize = 0;
-    m_edgePic = NULL;
-    m_edgeHistThreshold = 0;
-    m_chromaHistThreshold = 0.0;
-    m_scaledEdgeThreshold = 0.0;
-    m_scaledChromaThreshold = 0.0;
     m_zoneIndex = 0;
+    m_origPicBuffer = 0;
 }
 
 inline char *strcatFilename(const char *input, const char *suffix)
@@ -216,34 +246,6 @@
         }
     }
 
-    if (m_param->bHistBasedSceneCut)
-    {
-        m_planeSizes[0] = (m_param->sourceWidth >> x265_cli_csps[p->internalCsp].width[0]) * (m_param->sourceHeight >> x265_cli_csps[m_param->internalCsp].height[0]);
-        uint32_t pixelbytes = m_param->internalBitDepth > 8 ? 2 : 1;
-        m_edgePic = X265_MALLOC(pixel, m_planeSizes[0] * pixelbytes);
-        m_edgeHistThreshold = m_param->edgeTransitionThreshold;
-        m_chromaHistThreshold = x265_min(m_edgeHistThreshold * 10.0, MAX_SCENECUT_THRESHOLD);
-        m_scaledEdgeThreshold = x265_min(m_edgeHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
-        m_scaledChromaThreshold = x265_min(m_chromaHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
-        if (m_param->sourceBitDepth != m_param->internalBitDepth)
-        {
-            int size = m_param->sourceWidth * m_param->sourceHeight;
-            int hshift = CHROMA_H_SHIFT(m_param->internalCsp);
-            int vshift = CHROMA_V_SHIFT(m_param->internalCsp);
-            int widthC = m_param->sourceWidth >> hshift;
-            int heightC = m_param->sourceHeight >> vshift;
-
-            m_inputPic[0] = X265_MALLOC(pixel, size);
-            if (m_param->internalCsp != X265_CSP_I400)
-            {
-                for (int j = 1; j < 3; j++)
-                {
-                    m_inputPic[j] = X265_MALLOC(pixel, widthC * heightC);
-                }
-            }
-        }
-    }
-
     // Do not allow WPP if only one row or fewer than 3 columns, it is pointless and unstable
     if (rows == 1 || cols < 3)
     {
@@ -357,6 +359,10 @@
             lookAheadThreadPool[i].start();
     m_lookahead->m_numPools = pools;
     m_dpb = new DPB(m_param);
+
+    if (m_param->bEnableTemporalFilter)
+        m_origPicBuffer = new OrigPicBuffer();
+
     m_rateControl = new RateControl(*m_param, this);
     if (!m_param->bResetZoneConfig)
     {
@@ -518,6 +524,15 @@
             }
         }
     }
+    if (m_param->filmGrain)
+    {
+        m_filmGrainIn = x265_fopen(m_param->filmGrain, "rb");
+        if (!m_filmGrainIn)
+        {
+            x265_log_file(NULL, X265_LOG_ERROR, "Failed to open film grain characteristics binary file %s\n", m_param->filmGrain);
+        }
+    }
+
     m_bZeroLatency = !m_param->bframes && !m_param->lookaheadDepth && m_param->frameNumThreads == 1 && m_param->maxSlices == 1;
     m_aborted |= parseLambdaFile(m_param);
 
@@ -879,26 +894,6 @@
         }
     }
 
-    if (m_param->bHistBasedSceneCut)
-    {
-        if (m_edgePic != NULL)
-        {
-            X265_FREE_ZERO(m_edgePic);
-        }
-
-        if (m_param->sourceBitDepth != m_param->internalBitDepth)
-        {
-            X265_FREE_ZERO(m_inputPic[0]);
-            if (m_param->internalCsp != X265_CSP_I400)
-            {
-                for (int i = 1; i < 3; i++)
-                {
-                    X265_FREE_ZERO(m_inputPic[i]);
-                }
-            }
-        }
-    }
-
     for (int i = 0; i < m_param->frameNumThreads; i++)
     {
         if (m_frameEncoder[i])
@@ -924,6 +919,10 @@
         delete zoneReadCount;
         delete zoneWriteCount;
     }
+
+    if (m_param->bEnableTemporalFilter)
+        delete m_origPicBuffer;
+
     if (m_rateControl)
     {
         m_rateControl->destroy();
@@ -963,6 +962,8 @@
      }
     if (m_naluFile)
         fclose(m_naluFile);
+    if (m_filmGrainIn)
+        x265_fclose(m_filmGrainIn);
 
 #ifdef SVT_HEVC
     X265_FREE(m_svtAppData);
@@ -974,6 +975,7 @@
         /* release string arguments that were strdup'd */
         free((char*)m_param->rc.lambdaFileName);
         free((char*)m_param->rc.statFileName);
+        free((char*)m_param->rc.sharedMemName);
         free((char*)m_param->analysisReuseFileName);
         free((char*)m_param->scalingLists);
         free((char*)m_param->csvfn);
@@ -982,6 +984,7 @@
         free((char*)m_param->toneMapFile);
         free((char*)m_param->analysisSave);
         free((char*)m_param->analysisLoad);
+        free((char*)m_param->videoSignalTypePreset);
         PARAM_NS::x265_param_free(m_param);
     }
 }
@@ -1358,215 +1361,90 @@
     dest->planes2 = (char*)dest->planes1 + src->stride1 * (src->height >> x265_cli_cspssrc->colorSpace.height1);
 }
 
-bool Encoder::computeHistograms(x265_picture *pic)
+bool Encoder::isFilterThisframe(uint8_t sliceTypeConfig, int curSliceType)
 {
-    pixel *src = NULL, *planeV = NULL, *planeU = NULL;
-    uint32_t widthC, heightC;
-    int hshift, vshift;
-

 
@@ -72,7 +72,40 @@
 {
     { 1, 1, 1, 1, 1, 5, 1,  2, 2, 2, 50 },
     { 1, 1, 1, 1, 1, 5, 0, 16, 9, 9, 81 },
-    { 1, 1, 1, 1, 1, 5, 0,  1, 1, 1, 82 }
+    { 1, 1, 1, 1, 1, 5, 0,  1, 1, 1, 82 },
+    { 1, 1, 1, 1, 1, 5, 0, 18, 9, 9, 84 }
+};
+
+typedef struct
+{
+    int bEnableVideoSignalTypePresentFlag;
+    int bEnableColorDescriptionPresentFlag;
+    int bEnableChromaLocInfoPresentFlag;
+    int colorPrimaries;
+    int transferCharacteristics;
+    int matrixCoeffs;
+    int bEnableVideoFullRangeFlag;
+    int chromaSampleLocTypeTopField;
+    int chromaSampleLocTypeBottomField;
+    const char* systemId;
+}VideoSignalTypePresets;
+
+VideoSignalTypePresets vstPresets =
+{
+    {1, 1, 1, 6, 6, 6, 0, 0, 0, "BT601_525"},
+    {1, 1, 1, 5, 6, 5, 0, 0, 0, "BT601_626"},
+    {1, 1, 1, 1, 1, 1, 0, 0, 0, "BT709_YCC"},
+    {1, 1, 0, 1, 1, 0, 0, 0, 0, "BT709_RGB"},
+    {1, 1, 1, 9, 14, 1, 0, 2, 2, "BT2020_YCC_NCL"},
+    {1, 1, 0, 9, 16, 9, 0, 0, 0, "BT2020_RGB"},
+    {1, 1, 1, 9, 16, 9, 0, 2, 2, "BT2100_PQ_YCC"},
+    {1, 1, 1, 9, 16, 14, 0, 2, 2, "BT2100_PQ_ICTCP"},
+    {1, 1, 0, 9, 16, 0, 0, 0, 0, "BT2100_PQ_RGB"},
+    {1, 1, 1, 9, 18, 9, 0, 2, 2, "BT2100_HLG_YCC"},
+    {1, 1, 0, 9, 18, 0, 0, 0, 0, "BT2100_HLG_RGB"},
+    {1, 1, 0, 1, 1, 0, 1, 0, 0, "FR709_RGB"},
+    {1, 1, 0, 9, 14, 0, 1, 0, 0, "FR2020_RGB"},
+    {1, 1, 1, 12, 1, 6, 1, 1, 1, "FRP3D65_YCC"}
 };
 }
 
@@ -109,6 +142,7 @@
     m_threadPool = NULL;
     m_analysisFileIn = NULL;
     m_analysisFileOut = NULL;
+    m_filmGrainIn = NULL;
     m_naluFile = NULL;
     m_offsetEmergency = NULL;
     m_iFrameNum = 0;
@@ -134,12 +168,8 @@
     m_prevTonemapPayload.payload = NULL;
     m_startPoint = 0;
     m_saveCTUSize = 0;
-    m_edgePic = NULL;
-    m_edgeHistThreshold = 0;
-    m_chromaHistThreshold = 0.0;
-    m_scaledEdgeThreshold = 0.0;
-    m_scaledChromaThreshold = 0.0;
     m_zoneIndex = 0;
+    m_origPicBuffer = 0;
 }
 
 inline char *strcatFilename(const char *input, const char *suffix)
@@ -216,34 +246,6 @@
         }
     }
 
-    if (m_param->bHistBasedSceneCut)
-    {
-        m_planeSizes0 = (m_param->sourceWidth >> x265_cli_cspsp->internalCsp.width0) * (m_param->sourceHeight >> x265_cli_cspsm_param->internalCsp.height0);
-        uint32_t pixelbytes = m_param->internalBitDepth > 8 ? 2 : 1;
-        m_edgePic = X265_MALLOC(pixel, m_planeSizes0 * pixelbytes);
-        m_edgeHistThreshold = m_param->edgeTransitionThreshold;
-        m_chromaHistThreshold = x265_min(m_edgeHistThreshold * 10.0, MAX_SCENECUT_THRESHOLD);
-        m_scaledEdgeThreshold = x265_min(m_edgeHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
-        m_scaledChromaThreshold = x265_min(m_chromaHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
-        if (m_param->sourceBitDepth != m_param->internalBitDepth)
-        {
-            int size = m_param->sourceWidth * m_param->sourceHeight;
-            int hshift = CHROMA_H_SHIFT(m_param->internalCsp);
-            int vshift = CHROMA_V_SHIFT(m_param->internalCsp);
-            int widthC = m_param->sourceWidth >> hshift;
-            int heightC = m_param->sourceHeight >> vshift;
-
-            m_inputPic0 = X265_MALLOC(pixel, size);
-            if (m_param->internalCsp != X265_CSP_I400)
-            {
-                for (int j = 1; j < 3; j++)
-                {
-                    m_inputPicj = X265_MALLOC(pixel, widthC * heightC);
-                }
-            }
-        }
-    }
-
     // Do not allow WPP if only one row or fewer than 3 columns, it is pointless and unstable
     if (rows == 1 || cols < 3)
     {
@@ -357,6 +359,10 @@
             lookAheadThreadPooli.start();
     m_lookahead->m_numPools = pools;
     m_dpb = new DPB(m_param);
+
+    if (m_param->bEnableTemporalFilter)
+        m_origPicBuffer = new OrigPicBuffer();
+
     m_rateControl = new RateControl(*m_param, this);
     if (!m_param->bResetZoneConfig)
     {
@@ -518,6 +524,15 @@
             }
         }
     }
+    if (m_param->filmGrain)
+    {
+        m_filmGrainIn = x265_fopen(m_param->filmGrain, "rb");
+        if (!m_filmGrainIn)
+        {
+            x265_log_file(NULL, X265_LOG_ERROR, "Failed to open film grain characteristics binary file %s\n", m_param->filmGrain);
+        }
+    }
+
     m_bZeroLatency = !m_param->bframes && !m_param->lookaheadDepth && m_param->frameNumThreads == 1 && m_param->maxSlices == 1;
     m_aborted |= parseLambdaFile(m_param);
 
@@ -879,26 +894,6 @@
         }
     }
 
-    if (m_param->bHistBasedSceneCut)
-    {
-        if (m_edgePic != NULL)
-        {
-            X265_FREE_ZERO(m_edgePic);
-        }
-
-        if (m_param->sourceBitDepth != m_param->internalBitDepth)
-        {
-            X265_FREE_ZERO(m_inputPic0);
-            if (m_param->internalCsp != X265_CSP_I400)
-            {
-                for (int i = 1; i < 3; i++)
-                {
-                    X265_FREE_ZERO(m_inputPici);
-                }
-            }
-        }
-    }
-
     for (int i = 0; i < m_param->frameNumThreads; i++)
     {
         if (m_frameEncoderi)
@@ -924,6 +919,10 @@
         delete zoneReadCount;
         delete zoneWriteCount;
     }
+
+    if (m_param->bEnableTemporalFilter)
+        delete m_origPicBuffer;
+
     if (m_rateControl)
     {
         m_rateControl->destroy();
@@ -963,6 +962,8 @@
      }
     if (m_naluFile)
         fclose(m_naluFile);
+    if (m_filmGrainIn)
+        x265_fclose(m_filmGrainIn);
 
 #ifdef SVT_HEVC
     X265_FREE(m_svtAppData);
@@ -974,6 +975,7 @@
         /* release string arguments that were strdup'd */
         free((char*)m_param->rc.lambdaFileName);
         free((char*)m_param->rc.statFileName);
+        free((char*)m_param->rc.sharedMemName);
         free((char*)m_param->analysisReuseFileName);
         free((char*)m_param->scalingLists);
         free((char*)m_param->csvfn);
@@ -982,6 +984,7 @@
         free((char*)m_param->toneMapFile);
         free((char*)m_param->analysisSave);
         free((char*)m_param->analysisLoad);
+        free((char*)m_param->videoSignalTypePreset);
         PARAM_NS::x265_param_free(m_param);
     }
 }
@@ -1358,215 +1361,90 @@
     dest->planes2 = (char*)dest->planes1 + src->stride1 * (src->height >> x265_cli_cspssrc->colorSpace.height1);
 }
 
-bool Encoder::computeHistograms(x265_picture *pic)
+bool Encoder::isFilterThisframe(uint8_t sliceTypeConfig, int curSliceType)
 {
-    pixel *src = NULL, *planeV = NULL, *planeU = NULL;
-    uint32_t widthC, heightC;
-    int hshift, vshift;
-
​

x265_3.5.tar.gz/source/encoder/encoder.h -> x265_3.6.tar.gz/source/encoder/encoder.h Changed

@@ -32,6 +32,7 @@
 #include "nal.h"
 #include "framedata.h"
 #include "svt.h"
+#include "temporalfilter.h"
 #ifdef ENABLE_HDR10_PLUS
     #include "dynamicHDR10/hdr10plus.h"
 #endif
@@ -256,19 +257,6 @@
     int                m_bToneMap; // Enables tone-mapping
     int                m_enableNal;
 
-    /* For histogram based scene-cut detection */
-    pixel*             m_edgePic;
-    pixel*             m_inputPic3;
-    int32_t            m_curYUVHist3HISTOGRAM_BINS;
-    int32_t            m_prevYUVHist3HISTOGRAM_BINS;
-    int32_t            m_curEdgeHist2;
-    int32_t            m_prevEdgeHist2;
-    uint32_t           m_planeSizes3;
-    double             m_edgeHistThreshold;
-    double             m_chromaHistThreshold;
-    double             m_scaledEdgeThreshold;
-    double             m_scaledChromaThreshold;
-
 #ifdef ENABLE_HDR10_PLUS
     const hdr10plus_api     *m_hdr10plus_api;
     uint8_t                 **m_cim;
@@ -295,6 +283,9 @@
 
     ThreadSafeInteger* zoneReadCount;
     ThreadSafeInteger* zoneWriteCount;
+    /* Film grain model file */
+    FILE* m_filmGrainIn;
+    OrigPicBuffer*          m_origPicBuffer;
 
     Encoder();
     ~Encoder()
@@ -327,6 +318,8 @@
 
     void getStreamHeaders(NALList& list, Entropy& sbacCoder, Bitstream& bs);
 
+    void getEndNalUnits(NALList& list, Bitstream& bs);
+
     void fetchStats(x265_stats* stats, size_t statsSizeBytes);
 
     void printSummary();
@@ -373,11 +366,6 @@
 
     void copyPicture(x265_picture *dest, const x265_picture *src);
 
-    bool computeHistograms(x265_picture *pic);
-    void computeHistogramSAD(double *maxUVNormalizedSAD, double *edgeNormalizedSAD, int curPoc);
-    double normalizeRange(int32_t value, int32_t minValue, int32_t maxValue, double rangeStart, double rangeEnd);
-    void findSceneCuts(x265_picture *pic, bool& bDup, double m_maxUVSADVal, double m_edgeSADVal, bool& isMaxThres, bool& isHardSC);
-
     void initRefIdx();
     void analyseRefIdx(int *numRefIdx);
     void updateRefIdx();
@@ -387,6 +375,11 @@
 
     void configureDolbyVisionParams(x265_param* p);
 
+    void configureVideoSignalTypePreset(x265_param* p);
+
+    bool isFilterThisframe(uint8_t sliceTypeConfig, int curSliceType);
+    bool generateMcstfRef(Frame* frameEnc, FrameEncoder* currEncoder);
+
 protected:
 
     void initVPS(VPS *vps);

 
@@ -32,6 +32,7 @@
 #include "nal.h"
 #include "framedata.h"
 #include "svt.h"
+#include "temporalfilter.h"
 #ifdef ENABLE_HDR10_PLUS
     #include "dynamicHDR10/hdr10plus.h"
 #endif
@@ -256,19 +257,6 @@
     int                m_bToneMap; // Enables tone-mapping
     int                m_enableNal;
 
-    /* For histogram based scene-cut detection */
-    pixel*             m_edgePic;
-    pixel*             m_inputPic3;
-    int32_t            m_curYUVHist3HISTOGRAM_BINS;
-    int32_t            m_prevYUVHist3HISTOGRAM_BINS;
-    int32_t            m_curEdgeHist2;
-    int32_t            m_prevEdgeHist2;
-    uint32_t           m_planeSizes3;
-    double             m_edgeHistThreshold;
-    double             m_chromaHistThreshold;
-    double             m_scaledEdgeThreshold;
-    double             m_scaledChromaThreshold;
-
 #ifdef ENABLE_HDR10_PLUS
     const hdr10plus_api     *m_hdr10plus_api;
     uint8_t                 **m_cim;
@@ -295,6 +283,9 @@
 
     ThreadSafeInteger* zoneReadCount;
     ThreadSafeInteger* zoneWriteCount;
+    /* Film grain model file */
+    FILE* m_filmGrainIn;
+    OrigPicBuffer*          m_origPicBuffer;
 
     Encoder();
     ~Encoder()
@@ -327,6 +318,8 @@
 
     void getStreamHeaders(NALList& list, Entropy& sbacCoder, Bitstream& bs);
 
+    void getEndNalUnits(NALList& list, Bitstream& bs);
+
     void fetchStats(x265_stats* stats, size_t statsSizeBytes);
 
     void printSummary();
@@ -373,11 +366,6 @@
 
     void copyPicture(x265_picture *dest, const x265_picture *src);
 
-    bool computeHistograms(x265_picture *pic);
-    void computeHistogramSAD(double *maxUVNormalizedSAD, double *edgeNormalizedSAD, int curPoc);
-    double normalizeRange(int32_t value, int32_t minValue, int32_t maxValue, double rangeStart, double rangeEnd);
-    void findSceneCuts(x265_picture *pic, bool& bDup, double m_maxUVSADVal, double m_edgeSADVal, bool& isMaxThres, bool& isHardSC);
-
     void initRefIdx();
     void analyseRefIdx(int *numRefIdx);
     void updateRefIdx();
@@ -387,6 +375,11 @@
 
     void configureDolbyVisionParams(x265_param* p);
 
+    void configureVideoSignalTypePreset(x265_param* p);
+
+    bool isFilterThisframe(uint8_t sliceTypeConfig, int curSliceType);
+    bool generateMcstfRef(Frame* frameEnc, FrameEncoder* currEncoder);
+
 protected:
 
     void initVPS(VPS *vps);
​

x265_3.5.tar.gz/source/encoder/entropy.cpp -> x265_3.6.tar.gz/source/encoder/entropy.cpp Changed

@@ -245,9 +245,9 @@
 
     for (uint32_t i = 0; i < vps.maxTempSubLayers; i++)
     {
-        WRITE_UVLC(vps.maxDecPicBuffering - 1, "vps_max_dec_pic_buffering_minus1i");
-        WRITE_UVLC(vps.numReorderPics,         "vps_num_reorder_picsi");
-        WRITE_UVLC(vps.maxLatencyIncrease + 1, "vps_max_latency_increase_plus1i");
+        WRITE_UVLC(vps.maxDecPicBufferingi - 1, "vps_max_dec_pic_buffering_minus1i");
+        WRITE_UVLC(vps.numReorderPicsi,         "vps_num_reorder_picsi");
+        WRITE_UVLC(vps.maxLatencyIncreasei + 1, "vps_max_latency_increase_plus1i");
     }
 
     WRITE_CODE(0, 6, "vps_max_nuh_reserved_zero_layer_id");
@@ -291,9 +291,9 @@
 
     for (uint32_t i = 0; i < sps.maxTempSubLayers; i++)
     {
-        WRITE_UVLC(sps.maxDecPicBuffering - 1, "sps_max_dec_pic_buffering_minus1i");
-        WRITE_UVLC(sps.numReorderPics,         "sps_num_reorder_picsi");
-        WRITE_UVLC(sps.maxLatencyIncrease + 1, "sps_max_latency_increase_plus1i");
+        WRITE_UVLC(sps.maxDecPicBufferingi - 1, "sps_max_dec_pic_buffering_minus1i");
+        WRITE_UVLC(sps.numReorderPicsi,         "sps_num_reorder_picsi");
+        WRITE_UVLC(sps.maxLatencyIncreasei + 1, "sps_max_latency_increase_plus1i");
     }
 
     WRITE_UVLC(sps.log2MinCodingBlockSize - 3,    "log2_min_coding_block_size_minus3");
@@ -418,8 +418,11 @@
 
     if (maxTempSubLayers > 1)
     {
-         WRITE_FLAG(0, "sub_layer_profile_present_flagi");
-         WRITE_FLAG(0, "sub_layer_level_present_flagi");
+        for(int i = 0; i < maxTempSubLayers - 1; i++)
+        {
+            WRITE_FLAG(0, "sub_layer_profile_present_flagi");
+            WRITE_FLAG(0, "sub_layer_level_present_flagi");
+        }
          for (int i = maxTempSubLayers - 1; i < 8 ; i++)
              WRITE_CODE(0, 2, "reserved_zero_2bits");
     }

 
@@ -245,9 +245,9 @@
 
     for (uint32_t i = 0; i < vps.maxTempSubLayers; i++)
     {
-        WRITE_UVLC(vps.maxDecPicBuffering - 1, "vps_max_dec_pic_buffering_minus1i");
-        WRITE_UVLC(vps.numReorderPics,         "vps_num_reorder_picsi");
-        WRITE_UVLC(vps.maxLatencyIncrease + 1, "vps_max_latency_increase_plus1i");
+        WRITE_UVLC(vps.maxDecPicBufferingi - 1, "vps_max_dec_pic_buffering_minus1i");
+        WRITE_UVLC(vps.numReorderPicsi,         "vps_num_reorder_picsi");
+        WRITE_UVLC(vps.maxLatencyIncreasei + 1, "vps_max_latency_increase_plus1i");
     }
 
     WRITE_CODE(0, 6, "vps_max_nuh_reserved_zero_layer_id");
@@ -291,9 +291,9 @@
 
     for (uint32_t i = 0; i < sps.maxTempSubLayers; i++)
     {
-        WRITE_UVLC(sps.maxDecPicBuffering - 1, "sps_max_dec_pic_buffering_minus1i");
-        WRITE_UVLC(sps.numReorderPics,         "sps_num_reorder_picsi");
-        WRITE_UVLC(sps.maxLatencyIncrease + 1, "sps_max_latency_increase_plus1i");
+        WRITE_UVLC(sps.maxDecPicBufferingi - 1, "sps_max_dec_pic_buffering_minus1i");
+        WRITE_UVLC(sps.numReorderPicsi,         "sps_num_reorder_picsi");
+        WRITE_UVLC(sps.maxLatencyIncreasei + 1, "sps_max_latency_increase_plus1i");
     }
 
     WRITE_UVLC(sps.log2MinCodingBlockSize - 3,    "log2_min_coding_block_size_minus3");
@@ -418,8 +418,11 @@
 
     if (maxTempSubLayers > 1)
     {
-         WRITE_FLAG(0, "sub_layer_profile_present_flagi");
-         WRITE_FLAG(0, "sub_layer_level_present_flagi");
+        for(int i = 0; i < maxTempSubLayers - 1; i++)
+        {
+            WRITE_FLAG(0, "sub_layer_profile_present_flagi");
+            WRITE_FLAG(0, "sub_layer_level_present_flagi");
+        }
          for (int i = maxTempSubLayers - 1; i < 8 ; i++)
              WRITE_CODE(0, 2, "reserved_zero_2bits");
     }
​

x265_3.5.tar.gz/source/encoder/frameencoder.cpp -> x265_3.6.tar.gz/source/encoder/frameencoder.cpp Changed

@@ -34,6 +34,7 @@
 #include "common.h"
 #include "slicetype.h"
 #include "nal.h"
+#include "temporalfilter.h"
 
 namespace X265_NS {
 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param);
@@ -101,6 +102,16 @@
         delete m_rce.picTimingSEI;
         delete m_rce.hrdTiming;
     }
+
+    if (m_param->bEnableTemporalFilter)
+    {
+        delete m_frameEncTF->m_metld;
+
+        for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
+            m_frameEncTF->destroyRefPicInfo(&m_mcstfRefListi);
+
+        delete m_frameEncTF;
+    }
 }
 
 bool FrameEncoder::init(Encoder *top, int numRows, int numCols)
@@ -195,6 +206,16 @@
         m_sliceAddrBits = (uint16_t)(tmp + 1);
     }
 
+    if (m_param->bEnableTemporalFilter)
+    {
+        m_frameEncTF = new TemporalFilter();
+        if (m_frameEncTF)
+            m_frameEncTF->init(m_param);
+
+        for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
+            ok &= !!m_frameEncTF->createRefPicInfo(&m_mcstfRefListi, m_param);
+    }
+
     return ok;
 }
 
@@ -450,7 +471,7 @@
     m_ssimCnt = 0;
     memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats));
 
-    if (!m_param->bHistBasedSceneCut && m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
+    if (m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
     {
         int height = m_frame->m_fencPic->m_picHeight;
         int width = m_frame->m_fencPic->m_picWidth;
@@ -467,6 +488,12 @@
      * unit) */
     Slice* slice = m_frame->m_encData->m_slice;
 
+    if (m_param->bEnableEndOfSequence && m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_frame->m_poc)
+    {
+        m_bs.resetBits();
+        m_nalList.serialize(NAL_UNIT_EOS, m_bs);
+    }
+
     if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
     {
         m_bs.resetBits();
@@ -573,6 +600,12 @@
     int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
     m_rce.newQp = qp;
 
+    if (m_param->bEnableTemporalFilter)
+    {
+        m_frameEncTF->m_QP = qp;
+        m_frameEncTF->bilateralFilter(m_frame, m_mcstfRefList, m_param->temporalFilterStrength);
+    }
+
     if (m_nr)
     {
         if (qp > QP_MAX_SPEC && m_frame->m_param->rc.vbvBufferSize)
@@ -744,7 +777,7 @@
             // wait after removal of the access unit with the most recent
             // buffering period SEI message
             sei->m_auCpbRemovalDelay = X265_MIN(X265_MAX(1, m_rce.encodeOrder - prevBPSEI), (1 << hrd->cpbRemovalDelayLength));
-            sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics + poc - m_rce.encodeOrder;
+            sei->m_picDpbOutputDelay = slice->m_sps->numReorderPicsm_frame->m_tempLayer + poc - m_rce.encodeOrder;
         }
 
         sei->writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
@@ -756,7 +789,14 @@
         m_seiAlternativeTC.m_preferredTransferCharacteristics = m_param->preferredTransferCharacteristics;
         m_seiAlternativeTC.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
     }
-
+    /* Write Film grain characteristics if present */
+    if (this->m_top->m_filmGrainIn)
+    {
+        FilmGrainCharacteristics m_filmGrain;
+        /* Read the Film grain model file */
+        readModel(&m_filmGrain, this->m_top->m_filmGrainIn);
+        m_filmGrain.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
+    }
     /* Write user SEI */
     for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
     {
@@ -933,6 +973,23 @@
     if (m_param->bDynamicRefine && m_top->m_startPoint <= m_frame->m_encodeOrder) //Avoid collecting data that will not be used by future frames.
         collectDynDataFrame();
 
+    if (m_param->bEnableTemporalFilter && m_top->isFilterThisframe(m_frame->m_mcstf->m_sliceTypeConfig, m_frame->m_lowres.sliceType))
+    {
+        //Reset the MCSTF context in Frame Encoder and Frame
+        for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
+        {
+            memset(m_mcstfRefListi.mvs0, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
+            memset(m_mcstfRefListi.mvs1, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
+            memset(m_mcstfRefListi.mvs2, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
+            memset(m_mcstfRefListi.mvs,  0, sizeof(MV) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
+            memset(m_mcstfRefListi.noise, 0, sizeof(int) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
+            memset(m_mcstfRefListi.error, 0, sizeof(int) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
+
+            m_frame->m_mcstf->m_numRef = 0;
+        }
+    }
+
+
     if (m_param->rc.bStatWrite)
     {
         int totalI = 0, totalP = 0, totalSkip = 0;
@@ -1041,7 +1098,7 @@
             
             m_bs.writeByteAlignment();
 
-            m_nalList.serialize(slice->m_nalUnitType, m_bs);
+            m_nalList.serialize(slice->m_nalUnitType, m_bs, (!!m_param->bEnableTemporalSubLayers ? m_frame->m_tempLayer + 1 : (1 + (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N))));
         }
     }
     else
@@ -1062,7 +1119,7 @@
             m_entropyCoder.codeSliceHeaderWPPEntryPoints(m_substreamSizes, (slice->m_sps->numCuInHeight - 1), maxStreamSize);
         m_bs.writeByteAlignment();
 
-        m_nalList.serialize(slice->m_nalUnitType, m_bs);
+        m_nalList.serialize(slice->m_nalUnitType, m_bs, (!!m_param->bEnableTemporalSubLayers ? m_frame->m_tempLayer + 1 : (1 + (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N))));
     }
 
     if (m_param->decodedPictureHashSEI)
@@ -2127,6 +2184,54 @@
         m_nr->nrOffsetDenoisecat0 = 0;
     }
 }
+
+void FrameEncoder::readModel(FilmGrainCharacteristics* m_filmGrain, FILE* filmgrain)
+{
+    char const* errorMessage = "Error reading FilmGrain characteristics\n";
+    FilmGrain m_fg;
+    x265_fread((char* )&m_fg, sizeof(bool) * 3 + sizeof(uint8_t), 1, filmgrain, errorMessage);
+    m_filmGrain->m_filmGrainCharacteristicsCancelFlag = m_fg.m_filmGrainCharacteristicsCancelFlag;
+    m_filmGrain->m_filmGrainCharacteristicsPersistenceFlag = m_fg.m_filmGrainCharacteristicsPersistenceFlag;
+    m_filmGrain->m_filmGrainModelId = m_fg.m_filmGrainModelId;
+    m_filmGrain->m_separateColourDescriptionPresentFlag = m_fg.m_separateColourDescriptionPresentFlag;
+    if (m_filmGrain->m_separateColourDescriptionPresentFlag)
+    {
+        ColourDescription m_clr;
+        x265_fread((char* )&m_clr, sizeof(bool) + sizeof(uint8_t) * 5, 1, filmgrain, errorMessage);
+        m_filmGrain->m_filmGrainBitDepthLumaMinus8 = m_clr.m_filmGrainBitDepthLumaMinus8;
+        m_filmGrain->m_filmGrainBitDepthChromaMinus8 = m_clr.m_filmGrainBitDepthChromaMinus8;
+        m_filmGrain->m_filmGrainFullRangeFlag = m_clr.m_filmGrainFullRangeFlag;
+        m_filmGrain->m_filmGrainColourPrimaries = m_clr.m_filmGrainColourPrimaries;
+        m_filmGrain->m_filmGrainTransferCharacteristics = m_clr.m_filmGrainTransferCharacteristics;
+        m_filmGrain->m_filmGrainMatrixCoeffs = m_clr.m_filmGrainMatrixCoeffs;
+    }
+    FGPresent m_present;
+    x265_fread((char* )&m_present, sizeof(bool) * 3 + sizeof(uint8_t) * 2, 1, filmgrain, errorMessage);
+    m_filmGrain->m_blendingModeId = m_present.m_blendingModeId;
+    m_filmGrain->m_log2ScaleFactor = m_present.m_log2ScaleFactor;
+    m_filmGrain->m_compModel0.bPresentFlag = m_present.m_presentFlag0;
+    m_filmGrain->m_compModel1.bPresentFlag = m_present.m_presentFlag1;
+    m_filmGrain->m_compModel2.bPresentFlag = m_present.m_presentFlag2;
+    for (int i = 0; i < MAX_NUM_COMPONENT; i++)
+    {
+        if (m_filmGrain->m_compModeli.bPresentFlag)
+        {
+            x265_fread((char* )(&m_filmGrain->m_compModeli.m_filmGrainNumIntensityIntervalMinus1), sizeof(uint8_t), 1, filmgrain, errorMessage);
+            x265_fread((char* )(&m_filmGrain->m_compModeli.numModelValues), sizeof(uint8_t), 1, filmgrain, errorMessage);
+            m_filmGrain->m_compModeli.intensityValues = (FilmGrainCharacteristics::CompModelIntensityValues* ) malloc(sizeof(FilmGrainCharacteristics::CompModelIntensityValues) * (m_filmGrain->m_compModeli.m_filmGrainNumIntensityIntervalMinus1+1)) ;
+            for (int j = 0; j <= m_filmGrain->m_compModeli.m_filmGrainNumIntensityIntervalMinus1; j++)
+            {
+                x265_fread((char* )(&m_filmGrain->m_compModeli.intensityValuesj.intensityIntervalLowerBound), sizeof(uint8_t), 1, filmgrain, errorMessage);
+                x265_fread((char* )(&m_filmGrain->m_compModeli.intensityValuesj.intensityIntervalUpperBound), sizeof(uint8_t), 1, filmgrain, errorMessage);
+                m_filmGrain->m_compModeli.intensityValuesj.compModelValue = (int* ) malloc(sizeof(int) * (m_filmGrain->m_compModeli.numModelValues));
+                for (int k = 0; k < m_filmGrain->m_compModeli.numModelValues; k++)
+                {
+                    x265_fread((char* )(&m_filmGrain->m_compModeli.intensityValuesj.compModelValuek), sizeof(int), 1, filmgrain, errorMessage);
+                }
+            }
+        }
+    }
+}
 #if ENABLE_LIBVMAF
 void FrameEncoder::vmafFrameLevelScore()
 {

 
@@ -34,6 +34,7 @@
 #include "common.h"
 #include "slicetype.h"
 #include "nal.h"
+#include "temporalfilter.h"
 
 namespace X265_NS {
 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param);
@@ -101,6 +102,16 @@
         delete m_rce.picTimingSEI;
         delete m_rce.hrdTiming;
     }
+
+    if (m_param->bEnableTemporalFilter)
+    {
+        delete m_frameEncTF->m_metld;
+
+        for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
+            m_frameEncTF->destroyRefPicInfo(&m_mcstfRefListi);
+
+        delete m_frameEncTF;
+    }
 }
 
 bool FrameEncoder::init(Encoder *top, int numRows, int numCols)
@@ -195,6 +206,16 @@
         m_sliceAddrBits = (uint16_t)(tmp + 1);
     }
 
+    if (m_param->bEnableTemporalFilter)
+    {
+        m_frameEncTF = new TemporalFilter();
+        if (m_frameEncTF)
+            m_frameEncTF->init(m_param);
+
+        for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
+            ok &= !!m_frameEncTF->createRefPicInfo(&m_mcstfRefListi, m_param);
+    }
+
     return ok;
 }
 
@@ -450,7 +471,7 @@
     m_ssimCnt = 0;
     memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats));
 
-    if (!m_param->bHistBasedSceneCut && m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
+    if (m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
     {
         int height = m_frame->m_fencPic->m_picHeight;
         int width = m_frame->m_fencPic->m_picWidth;
@@ -467,6 +488,12 @@
      * unit) */
     Slice* slice = m_frame->m_encData->m_slice;
 
+    if (m_param->bEnableEndOfSequence && m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_frame->m_poc)
+    {
+        m_bs.resetBits();
+        m_nalList.serialize(NAL_UNIT_EOS, m_bs);
+    }
+
     if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
     {
         m_bs.resetBits();
@@ -573,6 +600,12 @@
     int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
     m_rce.newQp = qp;
 
+    if (m_param->bEnableTemporalFilter)
+    {
+        m_frameEncTF->m_QP = qp;
+        m_frameEncTF->bilateralFilter(m_frame, m_mcstfRefList, m_param->temporalFilterStrength);
+    }
+
     if (m_nr)
     {
         if (qp > QP_MAX_SPEC && m_frame->m_param->rc.vbvBufferSize)
@@ -744,7 +777,7 @@
             // wait after removal of the access unit with the most recent
             // buffering period SEI message
             sei->m_auCpbRemovalDelay = X265_MIN(X265_MAX(1, m_rce.encodeOrder - prevBPSEI), (1 << hrd->cpbRemovalDelayLength));
-            sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics + poc - m_rce.encodeOrder;
+            sei->m_picDpbOutputDelay = slice->m_sps->numReorderPicsm_frame->m_tempLayer + poc - m_rce.encodeOrder;
         }
 
         sei->writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
@@ -756,7 +789,14 @@
         m_seiAlternativeTC.m_preferredTransferCharacteristics = m_param->preferredTransferCharacteristics;
         m_seiAlternativeTC.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
     }
-
+    /* Write Film grain characteristics if present */
+    if (this->m_top->m_filmGrainIn)
+    {
+        FilmGrainCharacteristics m_filmGrain;
+        /* Read the Film grain model file */
+        readModel(&m_filmGrain, this->m_top->m_filmGrainIn);
+        m_filmGrain.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
+    }
     /* Write user SEI */
     for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
     {
@@ -933,6 +973,23 @@
     if (m_param->bDynamicRefine && m_top->m_startPoint <= m_frame->m_encodeOrder) //Avoid collecting data that will not be used by future frames.
         collectDynDataFrame();
 
+    if (m_param->bEnableTemporalFilter && m_top->isFilterThisframe(m_frame->m_mcstf->m_sliceTypeConfig, m_frame->m_lowres.sliceType))
+    {
+        //Reset the MCSTF context in Frame Encoder and Frame
+        for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
+        {
+            memset(m_mcstfRefListi.mvs0, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
+            memset(m_mcstfRefListi.mvs1, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
+            memset(m_mcstfRefListi.mvs2, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
+            memset(m_mcstfRefListi.mvs,  0, sizeof(MV) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
+            memset(m_mcstfRefListi.noise, 0, sizeof(int) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
+            memset(m_mcstfRefListi.error, 0, sizeof(int) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
+
+            m_frame->m_mcstf->m_numRef = 0;
+        }
+    }
+
+
     if (m_param->rc.bStatWrite)
     {
         int totalI = 0, totalP = 0, totalSkip = 0;
@@ -1041,7 +1098,7 @@
             
             m_bs.writeByteAlignment();
 
-            m_nalList.serialize(slice->m_nalUnitType, m_bs);
+            m_nalList.serialize(slice->m_nalUnitType, m_bs, (!!m_param->bEnableTemporalSubLayers ? m_frame->m_tempLayer + 1 : (1 + (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N))));
         }
     }
     else
@@ -1062,7 +1119,7 @@
             m_entropyCoder.codeSliceHeaderWPPEntryPoints(m_substreamSizes, (slice->m_sps->numCuInHeight - 1), maxStreamSize);
         m_bs.writeByteAlignment();
 
-        m_nalList.serialize(slice->m_nalUnitType, m_bs);
+        m_nalList.serialize(slice->m_nalUnitType, m_bs, (!!m_param->bEnableTemporalSubLayers ? m_frame->m_tempLayer + 1 : (1 + (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N))));
     }
 
     if (m_param->decodedPictureHashSEI)
@@ -2127,6 +2184,54 @@
         m_nr->nrOffsetDenoisecat0 = 0;
     }
 }
+
+void FrameEncoder::readModel(FilmGrainCharacteristics* m_filmGrain, FILE* filmgrain)
+{
+    char const* errorMessage = "Error reading FilmGrain characteristics\n";
+    FilmGrain m_fg;
+    x265_fread((char* )&m_fg, sizeof(bool) * 3 + sizeof(uint8_t), 1, filmgrain, errorMessage);
+    m_filmGrain->m_filmGrainCharacteristicsCancelFlag = m_fg.m_filmGrainCharacteristicsCancelFlag;
+    m_filmGrain->m_filmGrainCharacteristicsPersistenceFlag = m_fg.m_filmGrainCharacteristicsPersistenceFlag;
+    m_filmGrain->m_filmGrainModelId = m_fg.m_filmGrainModelId;
+    m_filmGrain->m_separateColourDescriptionPresentFlag = m_fg.m_separateColourDescriptionPresentFlag;
+    if (m_filmGrain->m_separateColourDescriptionPresentFlag)
+    {
+        ColourDescription m_clr;
+        x265_fread((char* )&m_clr, sizeof(bool) + sizeof(uint8_t) * 5, 1, filmgrain, errorMessage);
+        m_filmGrain->m_filmGrainBitDepthLumaMinus8 = m_clr.m_filmGrainBitDepthLumaMinus8;
+        m_filmGrain->m_filmGrainBitDepthChromaMinus8 = m_clr.m_filmGrainBitDepthChromaMinus8;
+        m_filmGrain->m_filmGrainFullRangeFlag = m_clr.m_filmGrainFullRangeFlag;
+        m_filmGrain->m_filmGrainColourPrimaries = m_clr.m_filmGrainColourPrimaries;
+        m_filmGrain->m_filmGrainTransferCharacteristics = m_clr.m_filmGrainTransferCharacteristics;
+        m_filmGrain->m_filmGrainMatrixCoeffs = m_clr.m_filmGrainMatrixCoeffs;
+    }
+    FGPresent m_present;
+    x265_fread((char* )&m_present, sizeof(bool) * 3 + sizeof(uint8_t) * 2, 1, filmgrain, errorMessage);
+    m_filmGrain->m_blendingModeId = m_present.m_blendingModeId;
+    m_filmGrain->m_log2ScaleFactor = m_present.m_log2ScaleFactor;
+    m_filmGrain->m_compModel[0].bPresentFlag = m_present.m_presentFlag[0];
+    m_filmGrain->m_compModel[1].bPresentFlag = m_present.m_presentFlag[1];
+    m_filmGrain->m_compModel[2].bPresentFlag = m_present.m_presentFlag[2];
+    for (int i = 0; i < MAX_NUM_COMPONENT; i++)
+    {
+        if (m_filmGrain->m_compModel[i].bPresentFlag)
+        {
+            x265_fread((char* )(&m_filmGrain->m_compModel[i].m_filmGrainNumIntensityIntervalMinus1), sizeof(uint8_t), 1, filmgrain, errorMessage);
+            x265_fread((char* )(&m_filmGrain->m_compModel[i].numModelValues), sizeof(uint8_t), 1, filmgrain, errorMessage);
+            m_filmGrain->m_compModel[i].intensityValues = (FilmGrainCharacteristics::CompModelIntensityValues* ) malloc(sizeof(FilmGrainCharacteristics::CompModelIntensityValues) * (m_filmGrain->m_compModel[i].m_filmGrainNumIntensityIntervalMinus1+1)) ;
+            for (int j = 0; j <= m_filmGrain->m_compModel[i].m_filmGrainNumIntensityIntervalMinus1; j++)
+            {
+                x265_fread((char* )(&m_filmGrain->m_compModel[i].intensityValues[j].intensityIntervalLowerBound), sizeof(uint8_t), 1, filmgrain, errorMessage);
+                x265_fread((char* )(&m_filmGrain->m_compModel[i].intensityValues[j].intensityIntervalUpperBound), sizeof(uint8_t), 1, filmgrain, errorMessage);
+                m_filmGrain->m_compModel[i].intensityValues[j].compModelValue = (int* ) malloc(sizeof(int) * (m_filmGrain->m_compModel[i].numModelValues));
+                for (int k = 0; k < m_filmGrain->m_compModel[i].numModelValues; k++)
+                {
+                    x265_fread((char* )(&m_filmGrain->m_compModel[i].intensityValues[j].compModelValue[k]), sizeof(int), 1, filmgrain, errorMessage);
+                }
+            }
+        }
+    }
+}
 #if ENABLE_LIBVMAF
 void FrameEncoder::vmafFrameLevelScore()
 {
​

x265_3.5.tar.gz/source/encoder/frameencoder.h -> x265_3.6.tar.gz/source/encoder/frameencoder.h Changed

@@ -40,6 +40,7 @@
 #include "ratecontrol.h"
 #include "reference.h"
 #include "nal.h"
+#include "temporalfilter.h"
 
 namespace X265_NS {
 // private x265 namespace
@@ -113,6 +114,34 @@
     }
 };
 
+/*Film grain characteristics*/
+struct FilmGrain
+{
+    bool    m_filmGrainCharacteristicsCancelFlag;
+    bool    m_filmGrainCharacteristicsPersistenceFlag;
+    bool    m_separateColourDescriptionPresentFlag;
+    uint8_t m_filmGrainModelId;
+    uint8_t m_blendingModeId;
+    uint8_t m_log2ScaleFactor;
+};
+
+struct ColourDescription
+{
+    bool        m_filmGrainFullRangeFlag;
+    uint8_t     m_filmGrainBitDepthLumaMinus8;
+    uint8_t     m_filmGrainBitDepthChromaMinus8;
+    uint8_t     m_filmGrainColourPrimaries;
+    uint8_t     m_filmGrainTransferCharacteristics;
+    uint8_t     m_filmGrainMatrixCoeffs;
+};
+
+struct FGPresent
+{
+    uint8_t     m_blendingModeId;
+    uint8_t     m_log2ScaleFactor;
+    bool        m_presentFlag[3];
+};
+
 // Manages the wave-front processing of a single encoding frame
 class FrameEncoder : public WaveFront, public Thread
 {
@@ -205,6 +234,10 @@
     FrameFilter              m_frameFilter;
     NALList                  m_nalList;
 
+    // initialization for mcstf
+    TemporalFilter*          m_frameEncTF;
+    TemporalFilterRefPicInfo m_mcstfRefList[MAX_MCSTF_TEMPORAL_WINDOW_LENGTH];
+
     class WeightAnalysis : public BondedTaskGroup
     {
     public:
@@ -250,6 +283,7 @@
     void collectDynDataFrame();
     void computeAvgTrainingData();
     void collectDynDataRow(CUData& ctu, FrameStats* rowStats);    
+    void readModel(FilmGrainCharacteristics* m_filmGrain, FILE* filmgrain);
 };
 }

 
@@ -40,6 +40,7 @@
 #include "ratecontrol.h"
 #include "reference.h"
 #include "nal.h"
+#include "temporalfilter.h"
 
 namespace X265_NS {
 // private x265 namespace
@@ -113,6 +114,34 @@
     }
 };
 
+/*Film grain characteristics*/
+struct FilmGrain
+{
+    bool    m_filmGrainCharacteristicsCancelFlag;
+    bool    m_filmGrainCharacteristicsPersistenceFlag;
+    bool    m_separateColourDescriptionPresentFlag;
+    uint8_t m_filmGrainModelId;
+    uint8_t m_blendingModeId;
+    uint8_t m_log2ScaleFactor;
+};
+
+struct ColourDescription
+{
+    bool        m_filmGrainFullRangeFlag;
+    uint8_t     m_filmGrainBitDepthLumaMinus8;
+    uint8_t     m_filmGrainBitDepthChromaMinus8;
+    uint8_t     m_filmGrainColourPrimaries;
+    uint8_t     m_filmGrainTransferCharacteristics;
+    uint8_t     m_filmGrainMatrixCoeffs;
+};
+
+struct FGPresent
+{
+    uint8_t     m_blendingModeId;
+    uint8_t     m_log2ScaleFactor;
+    bool        m_presentFlag[3];
+};
+
 // Manages the wave-front processing of a single encoding frame
 class FrameEncoder : public WaveFront, public Thread
 {
@@ -205,6 +234,10 @@
     FrameFilter              m_frameFilter;
     NALList                  m_nalList;
 
+    // initialization for mcstf
+    TemporalFilter*          m_frameEncTF;
+    TemporalFilterRefPicInfo m_mcstfRefList[MAX_MCSTF_TEMPORAL_WINDOW_LENGTH];
+
     class WeightAnalysis : public BondedTaskGroup
     {
     public:
@@ -250,6 +283,7 @@
     void collectDynDataFrame();
     void computeAvgTrainingData();
     void collectDynDataRow(CUData& ctu, FrameStats* rowStats);    
+    void readModel(FilmGrainCharacteristics* m_filmGrain, FILE* filmgrain);
 };
 }
 
​

x265_3.5.tar.gz/source/encoder/level.cpp -> x265_3.6.tar.gz/source/encoder/level.cpp Changed

@@ -72,7 +72,7 @@
      * for intra-only profiles (vps.ptl.intraConstraintFlag) */
     vps.ptl.lowerBitRateConstraintFlag = true;
 
-    vps.maxTempSubLayers = param.bEnableTemporalSubLayers ? 2 : 1;
+    vps.maxTempSubLayers = !!param.bEnableTemporalSubLayers ? param.bEnableTemporalSubLayers : 1;
     
     if (param.internalCsp == X265_CSP_I420 && param.internalBitDepth <= 10)
     {
@@ -167,7 +167,7 @@
 
         /* The value of sps_max_dec_pic_buffering_minus1[ HighestTid ] + 1 shall be less than
          * or equal to MaxDpbSize */
-        if (vps.maxDecPicBuffering > maxDpbSize)
+        if (vps.maxDecPicBuffering[vps.maxTempSubLayers - 1] > maxDpbSize)
             continue;
 
         /* For level 5 and higher levels, the value of CtbSizeY shall be equal to 32 or 64 */
@@ -182,8 +182,8 @@
         }
 
         /* The value of NumPocTotalCurr shall be less than or equal to 8 */
-        int numPocTotalCurr = param.maxNumReferences + vps.numReorderPics;
-        if (numPocTotalCurr > 8)
+        int numPocTotalCurr = param.maxNumReferences + vps.numReorderPics[vps.maxTempSubLayers - 1];
+        if (numPocTotalCurr > 10)
         {
             x265_log(&param, X265_LOG_WARNING, "level %s detected, but NumPocTotalCurr (total references) is non-compliant\n", levels[i].name);
             vps.ptl.profileIdc = Profile::NONE;
@@ -289,9 +289,40 @@
  * circumstances it will be quite noisy */
 bool enforceLevel(x265_param& param, VPS& vps)
 {
-    vps.numReorderPics = (param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes;
-    vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 2, (uint32_t)param.maxNumReferences) + 1);
+    vps.maxTempSubLayers = !!param.bEnableTemporalSubLayers ? param.bEnableTemporalSubLayers : 1;
+    for (uint32_t i = 0; i < vps.maxTempSubLayers; i++)
+    {
+        vps.numReorderPics[i] = (i == 0) ? ((param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes) : i;
+        vps.maxDecPicBuffering[i] = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics[i] + 2, (uint32_t)param.maxNumReferences) + 1);
+    }
 
+    if (!!param.bEnableTemporalSubLayers)
+    {
+        for (int i = 0; i < MAX_T_LAYERS - 1; i++)
+        {
+            // a lower layer can not have higher value of numReorderPics than a higher layer
+            if (vps.numReorderPics[i + 1] < vps.numReorderPics[i])
+            {
+                vps.numReorderPics[i + 1] = vps.numReorderPics[i];
+            }
+            // the value of numReorderPics[i] shall be in the range of 0 to maxDecPicBuffering[i] - 1, inclusive
+            if (vps.numReorderPics[i] > vps.maxDecPicBuffering[i] - 1)
+            {
+                vps.maxDecPicBuffering[i] = vps.numReorderPics[i] + 1;
+            }
+            // a lower layer can not have higher value of maxDecPicBuffering than a higher layer
+            if (vps.maxDecPicBuffering[i + 1] < vps.maxDecPicBuffering[i])
+            {
+                vps.maxDecPicBuffering[i + 1] = vps.maxDecPicBuffering[i];
+            }
+        }
+
+        // the value of numReorderPics[i] shall be in the range of 0 to maxDecPicBuffering[ i ] -  1, inclusive
+        if (vps.numReorderPics[MAX_T_LAYERS - 1] > vps.maxDecPicBuffering[MAX_T_LAYERS - 1] - 1)
+        {
+            vps.maxDecPicBuffering[MAX_T_LAYERS - 1] = vps.numReorderPics[MAX_T_LAYERS - 1] + 1;
+        }
+    }
     /* no level specified by user, just auto-detect from the configuration */
     if (param.levelIdc <= 0)
         return true;
@@ -391,10 +422,10 @@
     }
 
     int savedRefCount = param.maxNumReferences;
-    while (vps.maxDecPicBuffering > maxDpbSize && param.maxNumReferences > 1)
+    while (vps.maxDecPicBuffering[vps.maxTempSubLayers - 1] > maxDpbSize && param.maxNumReferences > 1)
     {
         param.maxNumReferences--;
-        vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 1, (uint32_t)param.maxNumReferences) + 1);
+        vps.maxDecPicBuffering[vps.maxTempSubLayers - 1] = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics[vps.maxTempSubLayers - 1] + 1, (uint32_t)param.maxNumReferences) + 1);
     }
     if (param.maxNumReferences != savedRefCount)
         x265_log(&param, X265_LOG_WARNING, "Lowering max references to %d to meet level requirement\n", param.maxNumReferences);

 
@@ -72,7 +72,7 @@
      * for intra-only profiles (vps.ptl.intraConstraintFlag) */
     vps.ptl.lowerBitRateConstraintFlag = true;
 
-    vps.maxTempSubLayers = param.bEnableTemporalSubLayers ? 2 : 1;
+    vps.maxTempSubLayers = !!param.bEnableTemporalSubLayers ? param.bEnableTemporalSubLayers : 1;
     
     if (param.internalCsp == X265_CSP_I420 && param.internalBitDepth <= 10)
     {
@@ -167,7 +167,7 @@
 
         /* The value of sps_max_dec_pic_buffering_minus1[ HighestTid ] + 1 shall be less than
          * or equal to MaxDpbSize */
-        if (vps.maxDecPicBuffering > maxDpbSize)
+        if (vps.maxDecPicBuffering[vps.maxTempSubLayers - 1] > maxDpbSize)
             continue;
 
         /* For level 5 and higher levels, the value of CtbSizeY shall be equal to 32 or 64 */
@@ -182,8 +182,8 @@
         }
 
         /* The value of NumPocTotalCurr shall be less than or equal to 8 */
-        int numPocTotalCurr = param.maxNumReferences + vps.numReorderPics;
-        if (numPocTotalCurr > 8)
+        int numPocTotalCurr = param.maxNumReferences + vps.numReorderPics[vps.maxTempSubLayers - 1];
+        if (numPocTotalCurr > 10)
         {
             x265_log(&param, X265_LOG_WARNING, "level %s detected, but NumPocTotalCurr (total references) is non-compliant\n", levels[i].name);
             vps.ptl.profileIdc = Profile::NONE;
@@ -289,9 +289,40 @@
  * circumstances it will be quite noisy */
 bool enforceLevel(x265_param& param, VPS& vps)
 {
-    vps.numReorderPics = (param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes;
-    vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 2, (uint32_t)param.maxNumReferences) + 1);
+    vps.maxTempSubLayers = !!param.bEnableTemporalSubLayers ? param.bEnableTemporalSubLayers : 1;
+    for (uint32_t i = 0; i < vps.maxTempSubLayers; i++)
+    {
+        vps.numReorderPics[i] = (i == 0) ? ((param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes) : i;
+        vps.maxDecPicBuffering[i] = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics[i] + 2, (uint32_t)param.maxNumReferences) + 1);
+    }
 
+    if (!!param.bEnableTemporalSubLayers)
+    {
+        for (int i = 0; i < MAX_T_LAYERS - 1; i++)
+        {
+            // a lower layer can not have higher value of numReorderPics than a higher layer
+            if (vps.numReorderPics[i + 1] < vps.numReorderPics[i])
+            {
+                vps.numReorderPics[i + 1] = vps.numReorderPics[i];
+            }
+            // the value of numReorderPics[i] shall be in the range of 0 to maxDecPicBuffering[i] - 1, inclusive
+            if (vps.numReorderPics[i] > vps.maxDecPicBuffering[i] - 1)
+            {
+                vps.maxDecPicBuffering[i] = vps.numReorderPics[i] + 1;
+            }
+            // a lower layer can not have higher value of maxDecPicBuffering than a higher layer
+            if (vps.maxDecPicBuffering[i + 1] < vps.maxDecPicBuffering[i])
+            {
+                vps.maxDecPicBuffering[i + 1] = vps.maxDecPicBuffering[i];
+            }
+        }
+
+        // the value of numReorderPics[i] shall be in the range of 0 to maxDecPicBuffering[ i ] -  1, inclusive
+        if (vps.numReorderPics[MAX_T_LAYERS - 1] > vps.maxDecPicBuffering[MAX_T_LAYERS - 1] - 1)
+        {
+            vps.maxDecPicBuffering[MAX_T_LAYERS - 1] = vps.numReorderPics[MAX_T_LAYERS - 1] + 1;
+        }
+    }
     /* no level specified by user, just auto-detect from the configuration */
     if (param.levelIdc <= 0)
         return true;
@@ -391,10 +422,10 @@
     }
 
     int savedRefCount = param.maxNumReferences;
-    while (vps.maxDecPicBuffering > maxDpbSize && param.maxNumReferences > 1)
+    while (vps.maxDecPicBuffering[vps.maxTempSubLayers - 1] > maxDpbSize && param.maxNumReferences > 1)
     {
         param.maxNumReferences--;
-        vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 1, (uint32_t)param.maxNumReferences) + 1);
+        vps.maxDecPicBuffering[vps.maxTempSubLayers - 1] = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics[vps.maxTempSubLayers - 1] + 1, (uint32_t)param.maxNumReferences) + 1);
     }
     if (param.maxNumReferences != savedRefCount)
         x265_log(&param, X265_LOG_WARNING, "Lowering max references to %d to meet level requirement\n", param.maxNumReferences);
​

x265_3.5.tar.gz/source/encoder/motion.cpp -> x265_3.6.tar.gz/source/encoder/motion.cpp Changed

@@ -190,6 +190,31 @@
     X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
 }
 
+/* Called by lookahead, luma only, no use of PicYuv */
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int refine)
+{
+    partEnum = partitionFromSizes(pwidth, pheight);
+    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
+    sad = primitives.pu[partEnum].sad;
+    ads = primitives.pu[partEnum].ads;
+    satd = primitives.pu[partEnum].satd;
+    sad_x3 = primitives.pu[partEnum].sad_x3;
+    sad_x4 = primitives.pu[partEnum].sad_x4;
+
+
+    blockwidth = pwidth;
+    blockOffset = offset;
+    absPartIdx = ctuAddr = -1;
+
+    /* Search params */
+    searchMethod = method;
+    subpelRefine = refine;
+
+    /* copy PU block into cache */
+    primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
+    X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
+}
+
 /* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
 void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int method, const int refine, bool bChroma)
 {

 
@@ -190,6 +190,31 @@
     X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
 }
 
+/* Called by lookahead, luma only, no use of PicYuv */
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int refine)
+{
+    partEnum = partitionFromSizes(pwidth, pheight);
+    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
+    sad = primitives.pu[partEnum].sad;
+    ads = primitives.pu[partEnum].ads;
+    satd = primitives.pu[partEnum].satd;
+    sad_x3 = primitives.pu[partEnum].sad_x3;
+    sad_x4 = primitives.pu[partEnum].sad_x4;
+
+
+    blockwidth = pwidth;
+    blockOffset = offset;
+    absPartIdx = ctuAddr = -1;
+
+    /* Search params */
+    searchMethod = method;
+    subpelRefine = refine;
+
+    /* copy PU block into cache */
+    primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
+    X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
+}
+
 /* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
 void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int method, const int refine, bool bChroma)
 {
​

x265_3.5.tar.gz/source/encoder/motion.h -> x265_3.6.tar.gz/source/encoder/motion.h Changed

 
@@ -77,7 +77,7 @@
     void init(int csp);
 
     /* Methods called at slice setup */
-
+    void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int subpelRefine);
     void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int searchL0, const int searchL1, const int subpelRefine);
     void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int searchMethod, const int subpelRefine, bool bChroma);
 
​

x265_3.5.tar.gz/source/encoder/nal.cpp -> x265_3.6.tar.gz/source/encoder/nal.cpp Changed

 
@@ -57,7 +57,7 @@
     other.m_buffer = X265_MALLOC(uint8_t, m_allocSize);
 }
 
-void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs)
+void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID)
 {
     static const char startCodePrefix[] = { 0, 0, 0, 1 };
 
@@ -114,7 +114,7 @@
      * nuh_reserved_zero_6bits  6-bits
      * nuh_temporal_id_plus1    3-bits */
     out[bytes++] = (uint8_t)nalUnitType << 1;
-    out[bytes++] = 1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N);
+    out[bytes++] = temporalID;
 
     /* 7.4.1 ...
      * Within the NAL unit, the following three-byte sequences shall not occur at
​

x265_3.5.tar.gz/source/encoder/nal.h -> x265_3.6.tar.gz/source/encoder/nal.h Changed

 
@@ -56,7 +56,7 @@
 
     void takeContents(NALList& other);
 
-    void serialize(NalUnitType nalUnitType, const Bitstream& bs);
+    void serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID = 1);
 
     uint32_t serializeSubstreams(uint32_t* streamSizeBytes, uint32_t streamCount, const Bitstream* streams);
 };
​

x265_3.5.tar.gz/source/encoder/ratecontrol.cpp -> x265_3.6.tar.gz/source/encoder/ratecontrol.cpp Changed

@@ -41,6 +41,10 @@
 #define BR_SHIFT  6
 #define CPB_SHIFT 4
 
+#define SHARED_DATA_ALIGNMENT      4 ///< 4btye, 32bit
+#define CUTREE_SHARED_MEM_NAME     "cutree"
+#define GOP_CNT_CU_TREE            3
+
 using namespace X265_NS;
 
 /* Amortize the partial cost of I frames over the next N frames */
@@ -104,6 +108,37 @@
     return output;
 }
 
+typedef struct CUTreeSharedDataItem
+{
+    uint8_t  *type;
+    uint16_t *stats;
+}CUTreeSharedDataItem;
+
+void static ReadSharedCUTreeData(void *dst, void *src, int32_t size)
+{
+    CUTreeSharedDataItem *statsDst = reinterpret_cast<CUTreeSharedDataItem *>(dst);
+    uint8_t *typeSrc = reinterpret_cast<uint8_t *>(src);
+    *statsDst->type = *typeSrc;
+
+    ///< for memory alignment, the type will take 32bit in the shared memory
+    int32_t offset = (sizeof(*statsDst->type) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
+    uint16_t *statsSrc = reinterpret_cast<uint16_t *>(typeSrc + offset);
+    memcpy(statsDst->stats, statsSrc, size - offset);
+}
+
+void static WriteSharedCUTreeData(void *dst, void *src, int32_t size)
+{
+    CUTreeSharedDataItem *statsSrc = reinterpret_cast<CUTreeSharedDataItem *>(src);
+    uint8_t *typeDst = reinterpret_cast<uint8_t *>(dst);
+    *typeDst = *statsSrc->type;
+
+    ///< for memory alignment, the type will take 32bit in the shared memory
+    int32_t offset = (sizeof(*statsSrc->type) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
+    uint16_t *statsDst = reinterpret_cast<uint16_t *>(typeDst + offset);
+    memcpy(statsDst, statsSrc->stats, size - offset);
+}
+
+
 inline double qScale2bits(RateControlEntry *rce, double qScale)
 {
     if (qScale < 0.1)
@@ -209,6 +244,7 @@
     m_lastAbrResetPoc = -1;
     m_statFileOut = NULL;
     m_cutreeStatFileOut = m_cutreeStatFileIn = NULL;
+    m_cutreeShrMem = NULL;
     m_rce2Pass = NULL;
     m_encOrder = NULL;
     m_lastBsliceSatdCost = 0;
@@ -224,6 +260,8 @@
     m_initVbv = false;
     m_singleFrameVbv = 0;
     m_rateTolerance = 1.0;
+    m_encodedSegmentBits = 0;
+    m_segDur = 0;
 
     if (m_param->rc.vbvBufferSize)
     {
@@ -320,47 +358,86 @@
         m_cuTreeStats.qpBuffer[i] = NULL;
 }
 
-bool RateControl::init(const SPS& sps)
+bool RateControl::initCUTreeSharedMem()
 {
-    if (m_isVbv && !m_initVbv)
-    {
-        /* We don't support changing the ABR bitrate right now,
-         * so if the stream starts as CBR, keep it CBR. */
-        if (m_param->rc.vbvBufferSize < (int)(m_param->rc.vbvMaxBitrate / m_fps))
+    if (!m_cutreeShrMem) {
+        m_cutreeShrMem = new RingMem();
+        if (!m_cutreeShrMem)
         {
-            m_param->rc.vbvBufferSize = (int)(m_param->rc.vbvMaxBitrate / m_fps);
-            x265_log(m_param, X265_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
-                     m_param->rc.vbvBufferSize);
+            return false;
         }
-        int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
-        int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
 
-        if (m_param->bEmitHRDSEI && !m_param->decoderVbvMaxRate)
+        ///< now cutree data form at most 3 gops would be stored in the shared memory at the same time
+        int32_t itemSize = (sizeof(uint8_t) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
+        if (m_param->rc.qgSize == 8)
         {
-            const HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
-            vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
-            vbvMaxBitrate = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
+            itemSize += sizeof(uint16_t) * m_ncu * 4;
         }
-        m_bufferRate = vbvMaxBitrate / m_fps;
-        m_vbvMaxRate = vbvMaxBitrate;
-        m_bufferSize = vbvBufferSize;
-        m_singleFrameVbv = m_bufferRate * 1.1 > m_bufferSize;
+        else
+        {
+            itemSize += sizeof(uint16_t) * m_ncu;
+        }
+
+        int32_t itemCnt = X265_MIN(m_param->keyframeMax, (int)(m_fps + 0.5));
+        itemCnt *= GOP_CNT_CU_TREE;
 
-        if (m_param->rc.vbvBufferInit > 1.)
-            m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
-        if (m_param->vbvBufferEnd > 1.)
-            m_param->vbvBufferEnd = x265_clip3(0.0, 1.0, m_param->vbvBufferEnd / m_param->rc.vbvBufferSize);
-        if (m_param->vbvEndFrameAdjust > 1.)
-            m_param->vbvEndFrameAdjust = x265_clip3(0.0, 1.0, m_param->vbvEndFrameAdjust);
-        m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
-        m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
-        m_bufferFillActual = m_bufferFillFinal;
-        m_bufferExcess = 0;
-        m_minBufferFill = m_param->minVbvFullness / 100;
-        m_maxBufferFill = 1 - (m_param->maxVbvFullness / 100);
-        m_initVbv = true;
+        char shrname[MAX_SHR_NAME_LEN] = { 0 };
+        strcpy(shrname, m_param->rc.sharedMemName);
+        strcat(shrname, CUTREE_SHARED_MEM_NAME);
+
+        if (!m_cutreeShrMem->init(itemSize, itemCnt, shrname))
+        {
+            return false;
+        }
     }
 
+    return true;
+}
+
+void RateControl::initVBV(const SPS& sps)
+{
+    /* We don't support changing the ABR bitrate right now,
+ * so if the stream starts as CBR, keep it CBR. */
+    if (m_param->rc.vbvBufferSize < (int)(m_param->rc.vbvMaxBitrate / m_fps))
+    {
+        m_param->rc.vbvBufferSize = (int)(m_param->rc.vbvMaxBitrate / m_fps);
+        x265_log(m_param, X265_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
+            m_param->rc.vbvBufferSize);
+    }
+    int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
+    int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
+
+    if (m_param->bEmitHRDSEI && !m_param->decoderVbvMaxRate)
+    {
+        const HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
+        vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
+        vbvMaxBitrate = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
+    }
+    m_bufferRate = vbvMaxBitrate / m_fps;
+    m_vbvMaxRate = vbvMaxBitrate;
+    m_bufferSize = vbvBufferSize;
+    m_singleFrameVbv = m_bufferRate * 1.1 > m_bufferSize;
+
+    if (m_param->rc.vbvBufferInit > 1.)
+        m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
+    if (m_param->vbvBufferEnd > 1.)
+        m_param->vbvBufferEnd = x265_clip3(0.0, 1.0, m_param->vbvBufferEnd / m_param->rc.vbvBufferSize);
+    if (m_param->vbvEndFrameAdjust > 1.)
+        m_param->vbvEndFrameAdjust = x265_clip3(0.0, 1.0, m_param->vbvEndFrameAdjust);
+    m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
+    m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
+    m_bufferFillActual = m_bufferFillFinal;
+    m_bufferExcess = 0;
+    m_minBufferFill = m_param->minVbvFullness / 100;
+    m_maxBufferFill = 1 - (m_param->maxVbvFullness / 100);
+    m_initVbv = true;
+}
+
+bool RateControl::init(const SPS& sps)
+{
+    if (m_isVbv && !m_initVbv)
+        initVBV(sps);
+
     if (!m_param->bResetZoneConfig && (m_relativeComplexity == NULL))
     {
         m_relativeComplexity = X265_MALLOC(double, m_param->reconfigWindowSize);
@@ -373,7 +450,9 @@
 
     m_totalBits = 0;
     m_encodedBits = 0;
+    m_encodedSegmentBits = 0;
     m_framesDone = 0;
+    m_segDur = 0;
     m_residualCost = 0;
     m_partialResidualCost = 0;
     m_amortizeFraction = 0.85;
@@ -421,244 +500,257 @@
         /* Load stat file and init 2pass algo */
         if (m_param->rc.bStatRead)
         {
-            m_expectedBitsSum = 0;

 
@@ -41,6 +41,10 @@
 #define BR_SHIFT  6
 #define CPB_SHIFT 4
 
+#define SHARED_DATA_ALIGNMENT      4 ///< 4btye, 32bit
+#define CUTREE_SHARED_MEM_NAME     "cutree"
+#define GOP_CNT_CU_TREE            3
+
 using namespace X265_NS;
 
 /* Amortize the partial cost of I frames over the next N frames */
@@ -104,6 +108,37 @@
     return output;
 }
 
+typedef struct CUTreeSharedDataItem
+{
+    uint8_t  *type;
+    uint16_t *stats;
+}CUTreeSharedDataItem;
+
+void static ReadSharedCUTreeData(void *dst, void *src, int32_t size)
+{
+    CUTreeSharedDataItem *statsDst = reinterpret_cast<CUTreeSharedDataItem *>(dst);
+    uint8_t *typeSrc = reinterpret_cast<uint8_t *>(src);
+    *statsDst->type = *typeSrc;
+
+    ///< for memory alignment, the type will take 32bit in the shared memory
+    int32_t offset = (sizeof(*statsDst->type) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
+    uint16_t *statsSrc = reinterpret_cast<uint16_t *>(typeSrc + offset);
+    memcpy(statsDst->stats, statsSrc, size - offset);
+}
+
+void static WriteSharedCUTreeData(void *dst, void *src, int32_t size)
+{
+    CUTreeSharedDataItem *statsSrc = reinterpret_cast<CUTreeSharedDataItem *>(src);
+    uint8_t *typeDst = reinterpret_cast<uint8_t *>(dst);
+    *typeDst = *statsSrc->type;
+
+    ///< for memory alignment, the type will take 32bit in the shared memory
+    int32_t offset = (sizeof(*statsSrc->type) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
+    uint16_t *statsDst = reinterpret_cast<uint16_t *>(typeDst + offset);
+    memcpy(statsDst, statsSrc->stats, size - offset);
+}
+
+
 inline double qScale2bits(RateControlEntry *rce, double qScale)
 {
     if (qScale < 0.1)
@@ -209,6 +244,7 @@
     m_lastAbrResetPoc = -1;
     m_statFileOut = NULL;
     m_cutreeStatFileOut = m_cutreeStatFileIn = NULL;
+    m_cutreeShrMem = NULL;
     m_rce2Pass = NULL;
     m_encOrder = NULL;
     m_lastBsliceSatdCost = 0;
@@ -224,6 +260,8 @@
     m_initVbv = false;
     m_singleFrameVbv = 0;
     m_rateTolerance = 1.0;
+    m_encodedSegmentBits = 0;
+    m_segDur = 0;
 
     if (m_param->rc.vbvBufferSize)
     {
@@ -320,47 +358,86 @@
         m_cuTreeStats.qpBufferi = NULL;
 }
 
-bool RateControl::init(const SPS& sps)
+bool RateControl::initCUTreeSharedMem()
 {
-    if (m_isVbv && !m_initVbv)
-    {
-        /* We don't support changing the ABR bitrate right now,
-         * so if the stream starts as CBR, keep it CBR. */
-        if (m_param->rc.vbvBufferSize < (int)(m_param->rc.vbvMaxBitrate / m_fps))
+    if (!m_cutreeShrMem) {
+        m_cutreeShrMem = new RingMem();
+        if (!m_cutreeShrMem)
         {
-            m_param->rc.vbvBufferSize = (int)(m_param->rc.vbvMaxBitrate / m_fps);
-            x265_log(m_param, X265_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
-                     m_param->rc.vbvBufferSize);
+            return false;
         }
-        int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
-        int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
 
-        if (m_param->bEmitHRDSEI && !m_param->decoderVbvMaxRate)
+        ///< now cutree data form at most 3 gops would be stored in the shared memory at the same time
+        int32_t itemSize = (sizeof(uint8_t) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
+        if (m_param->rc.qgSize == 8)
         {
-            const HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
-            vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
-            vbvMaxBitrate = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
+            itemSize += sizeof(uint16_t) * m_ncu * 4;
         }
-        m_bufferRate = vbvMaxBitrate / m_fps;
-        m_vbvMaxRate = vbvMaxBitrate;
-        m_bufferSize = vbvBufferSize;
-        m_singleFrameVbv = m_bufferRate * 1.1 > m_bufferSize;
+        else
+        {
+            itemSize += sizeof(uint16_t) * m_ncu;
+        }
+
+        int32_t itemCnt = X265_MIN(m_param->keyframeMax, (int)(m_fps + 0.5));
+        itemCnt *= GOP_CNT_CU_TREE;
 
-        if (m_param->rc.vbvBufferInit > 1.)
-            m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
-        if (m_param->vbvBufferEnd > 1.)
-            m_param->vbvBufferEnd = x265_clip3(0.0, 1.0, m_param->vbvBufferEnd / m_param->rc.vbvBufferSize);
-        if (m_param->vbvEndFrameAdjust > 1.)
-            m_param->vbvEndFrameAdjust = x265_clip3(0.0, 1.0, m_param->vbvEndFrameAdjust);
-        m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
-        m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
-        m_bufferFillActual = m_bufferFillFinal;
-        m_bufferExcess = 0;
-        m_minBufferFill = m_param->minVbvFullness / 100;
-        m_maxBufferFill = 1 - (m_param->maxVbvFullness / 100);
-        m_initVbv = true;
+        char shrname[MAX_SHR_NAME_LEN] = { 0 };
+        strcpy(shrname, m_param->rc.sharedMemName);
+        strcat(shrname, CUTREE_SHARED_MEM_NAME);
+
+        if (!m_cutreeShrMem->init(itemSize, itemCnt, shrname))
+        {
+            return false;
+        }
     }
 
+    return true;
+}
+
+void RateControl::initVBV(const SPS& sps)  /* derive the VBV buffer state (rate, size, initial fill, fullness limits) from m_param and, when HRD SEI is emitted, from the SPS HRD parameters */
+{
+    /* We don't support changing the ABR bitrate right now,
+     * so if the stream starts as CBR, keep it CBR. */
+    if (m_param->rc.vbvBufferSize < (int)(m_param->rc.vbvMaxBitrate / m_fps))
+    {
+        m_param->rc.vbvBufferSize = (int)(m_param->rc.vbvMaxBitrate / m_fps);  /* raise the buffer to at least one frame at the max bitrate */
+        x265_log(m_param, X265_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
+            m_param->rc.vbvBufferSize);
+    }
+    int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;  /* the warning above reports the param in kbit; scaled by 1000 here */
+    int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
+
+    if (m_param->bEmitHRDSEI && !m_param->decoderVbvMaxRate)  /* with HRD SEI on and no decoder max rate set, both values are taken from the HRD fields instead */
+    {
+        const HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
+        vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
+        vbvMaxBitrate = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
+    }
+    m_bufferRate = vbvMaxBitrate / m_fps;  /* max bitrate divided by frame rate, i.e. per-frame amount */
+    m_vbvMaxRate = vbvMaxBitrate;
+    m_bufferSize = vbvBufferSize;
+    m_singleFrameVbv = m_bufferRate * 1.1 > m_bufferSize;  /* set when the buffer is smaller than 1.1 times the per-frame rate */
+
+    if (m_param->rc.vbvBufferInit > 1.)  /* values above 1 are divided by vbvBufferSize to become a fraction */
+        m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
+    if (m_param->vbvBufferEnd > 1.)
+        m_param->vbvBufferEnd = x265_clip3(0.0, 1.0, m_param->vbvBufferEnd / m_param->rc.vbvBufferSize);
+    if (m_param->vbvEndFrameAdjust > 1.)
+        m_param->vbvEndFrameAdjust = x265_clip3(0.0, 1.0, m_param->vbvEndFrameAdjust);
+    m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));  /* initial fill is never below one frame's share of the buffer */
+    m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
+    m_bufferFillActual = m_bufferFillFinal;
+    m_bufferExcess = 0;
+    m_minBufferFill = m_param->minVbvFullness / 100;  /* fullness params are divided by 100 to get fractions */
+    m_maxBufferFill = 1 - (m_param->maxVbvFullness / 100);
+    m_initVbv = true;  /* RateControl::init() tests this flag so initVBV() is not repeated */
+}
+
+bool RateControl::init(const SPS& sps)
+{
+    if (m_isVbv && !m_initVbv)
+        initVBV(sps);
+
     if (!m_param->bResetZoneConfig && (m_relativeComplexity == NULL))
     {
         m_relativeComplexity = X265_MALLOC(double, m_param->reconfigWindowSize);
@@ -373,7 +450,9 @@
 
     m_totalBits = 0;
     m_encodedBits = 0;
+    m_encodedSegmentBits = 0;
     m_framesDone = 0;
+    m_segDur = 0;
     m_residualCost = 0;
     m_partialResidualCost = 0;
     m_amortizeFraction = 0.85;
@@ -421,244 +500,257 @@
         /* Load stat file and init 2pass algo */
         if (m_param->rc.bStatRead)
         {
-            m_expectedBitsSum = 0;
​

x265_3.5.tar.gz/source/encoder/ratecontrol.h -> x265_3.6.tar.gz/source/encoder/ratecontrol.h Changed

@@ -28,6 +28,7 @@
 
 #include "common.h"
 #include "sei.h"
+#include "ringmem.h"
 
 namespace X265_NS {
 // encoder namespace
@@ -46,11 +47,6 @@
 #define MIN_AMORTIZE_FRACTION 0.2
 #define CLIP_DURATION(f) x265_clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f)
 
-/*Scenecut Aware QP*/
-#define WINDOW1_DELTA           1.0 /* The offset for the frames coming in the window-1*/
-#define WINDOW2_DELTA           0.7 /* The offset for the frames coming in the window-2*/
-#define WINDOW3_DELTA           0.4 /* The offset for the frames coming in the window-3*/
-
 struct Predictor
 {
     double coeffMin;
@@ -73,6 +69,7 @@
     Predictor  rowPreds[3][2];
     Predictor* rowPred[2];
 
+    int64_t currentSatd;
     int64_t lastSatd;      /* Contains the picture cost of the previous frame, required for resetAbr and VBV */
     int64_t leadingNoBSatd;
     int64_t rowTotalBits;  /* update cplxrsum and totalbits at the end of 2 rows */
@@ -87,6 +84,8 @@
     double  rowCplxrSum;
     double  qpNoVbv;
     double  bufferFill;
+    double  bufferFillFinal;
+    double  bufferFillActual;
     double  targetFill;
     bool    vbvEndAdj;
     double  frameDuration;
@@ -192,6 +191,8 @@
     double  m_qCompress;
     int64_t m_totalBits;        /* total bits used for already encoded frames (after ammortization) */
     int64_t m_encodedBits;      /* bits used for encoded frames (without ammortization) */
+    int64_t m_encodedSegmentBits;      /* bits used for encoded frames in a segment*/
+    double  m_segDur;
     double  m_fps;
     int64_t m_satdCostWindow50;
     int64_t m_encodedBitsWindow50;
@@ -237,6 +238,8 @@
     FILE*   m_statFileOut;
     FILE*   m_cutreeStatFileOut;
     FILE*   m_cutreeStatFileIn;
+    ///< store the cutree data in memory instead of file
+    RingMem *m_cutreeShrMem;
     double  m_lastAccumPNorm;
     double  m_expectedBitsSum;   /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */
     int64_t m_predictedBits;
@@ -254,6 +257,7 @@
     RateControl(x265_param& p, Encoder *enc);
     bool init(const SPS& sps);
     void initHRD(SPS& sps);
+    void initVBV(const SPS& sps);
     void reconfigureRC();
 
     void setFinalFrameCount(int count);
@@ -271,6 +275,9 @@
     int writeRateControlFrameStats(Frame* curFrame, RateControlEntry* rce);
     bool   initPass2();
 
+    bool initCUTreeSharedMem();
+    void skipCUTreeSharedMemRead(int32_t cnt);
+
     double forwardMasking(Frame* curFrame, double q);
     double backwardMasking(Frame* curFrame, double q);
 
@@ -291,6 +298,7 @@
     double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR
     double tuneAbrQScaleFromFeedback(double qScale);
     double tuneQScaleForZone(RateControlEntry *rce, double qScale); // Tune qScale to adhere to zone budget
+    double tuneQscaleForSBRC(Frame* curFrame, double q); // Tune qScale to adhere to segment budget
     void   accumPQpUpdate();
 
     int    getPredictorType(int lowresSliceType, int sliceType);
@@ -311,6 +319,7 @@
     double tuneQScaleForGrain(double rcOverflow);
     void   splitdeltaPOC(char deltapoc, RateControlEntry *rce);
     void   splitbUsed(char deltapoc, RateControlEntry *rce);
+    void   checkAndResetCRF(RateControlEntry* rce);
 };
 }
 #endif // ifndef X265_RATECONTROL_H

 
@@ -28,6 +28,7 @@
 
 #include "common.h"
 #include "sei.h"
+#include "ringmem.h"
 
 namespace X265_NS {
 // encoder namespace
@@ -46,11 +47,6 @@
 #define MIN_AMORTIZE_FRACTION 0.2
 #define CLIP_DURATION(f) x265_clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f)
 
-/*Scenecut Aware QP*/
-#define WINDOW1_DELTA           1.0 /* The offset for the frames coming in the window-1*/
-#define WINDOW2_DELTA           0.7 /* The offset for the frames coming in the window-2*/
-#define WINDOW3_DELTA           0.4 /* The offset for the frames coming in the window-3*/
-
 struct Predictor
 {
     double coeffMin;
@@ -73,6 +69,7 @@
     Predictor  rowPreds[3][2];
     Predictor* rowPred[2];
 
+    int64_t currentSatd;
     int64_t lastSatd;      /* Contains the picture cost of the previous frame, required for resetAbr and VBV */
     int64_t leadingNoBSatd;
     int64_t rowTotalBits;  /* update cplxrsum and totalbits at the end of 2 rows */
@@ -87,6 +84,8 @@
     double  rowCplxrSum;
     double  qpNoVbv;
     double  bufferFill;
+    double  bufferFillFinal;
+    double  bufferFillActual;
     double  targetFill;
     bool    vbvEndAdj;
     double  frameDuration;
@@ -192,6 +191,8 @@
     double  m_qCompress;
     int64_t m_totalBits;        /* total bits used for already encoded frames (after ammortization) */
     int64_t m_encodedBits;      /* bits used for encoded frames (without ammortization) */
+    int64_t m_encodedSegmentBits;      /* bits used for encoded frames in a segment*/
+    double  m_segDur;
     double  m_fps;
     int64_t m_satdCostWindow50;
     int64_t m_encodedBitsWindow50;
@@ -237,6 +238,8 @@
     FILE*   m_statFileOut;
     FILE*   m_cutreeStatFileOut;
     FILE*   m_cutreeStatFileIn;
+    ///< store the cutree data in memory instead of file
+    RingMem *m_cutreeShrMem;
     double  m_lastAccumPNorm;
     double  m_expectedBitsSum;   /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */
     int64_t m_predictedBits;
@@ -254,6 +257,7 @@
     RateControl(x265_param& p, Encoder *enc);
     bool init(const SPS& sps);
     void initHRD(SPS& sps);
+    void initVBV(const SPS& sps);
     void reconfigureRC();
 
     void setFinalFrameCount(int count);
@@ -271,6 +275,9 @@
     int writeRateControlFrameStats(Frame* curFrame, RateControlEntry* rce);
     bool   initPass2();
 
+    bool initCUTreeSharedMem();
+    void skipCUTreeSharedMemRead(int32_t cnt);
+
     double forwardMasking(Frame* curFrame, double q);
     double backwardMasking(Frame* curFrame, double q);
 
@@ -291,6 +298,7 @@
     double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR
     double tuneAbrQScaleFromFeedback(double qScale);
     double tuneQScaleForZone(RateControlEntry *rce, double qScale); // Tune qScale to adhere to zone budget
+    double tuneQscaleForSBRC(Frame* curFrame, double q); // Tune qScale to adhere to segment budget
     void   accumPQpUpdate();
 
     int    getPredictorType(int lowresSliceType, int sliceType);
@@ -311,6 +319,7 @@
     double tuneQScaleForGrain(double rcOverflow);
     void   splitdeltaPOC(char deltapoc, RateControlEntry *rce);
     void   splitbUsed(char deltapoc, RateControlEntry *rce);
+    void   checkAndResetCRF(RateControlEntry* rce);
 };
 }
 #endif // ifndef X265_RATECONTROL_H
​

x265_3.5.tar.gz/source/encoder/sei.cpp -> x265_3.6.tar.gz/source/encoder/sei.cpp Changed

 
@@ -68,7 +68,7 @@
     {
         if (nalUnitType != NAL_UNIT_UNSPECIFIED)
             bs.writeByteAlignment();
-        list.serialize(nalUnitType, bs);
+        list.serialize(nalUnitType, bs, (1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N)));
     }
 }
 
​

x265_3.5.tar.gz/source/encoder/sei.h -> x265_3.6.tar.gz/source/encoder/sei.h Changed

@@ -73,6 +73,101 @@
     }
 };
 
+/* Film grain characteristics */
+class FilmGrainCharacteristics : public SEI
+{
+  public:
+
+    FilmGrainCharacteristics()
+    {
+        m_payloadType = FILM_GRAIN_CHARACTERISTICS;
+        m_payloadSize = 0;
+    }
+
+    struct CompModelIntensityValues
+    {
+        uint8_t intensityIntervalLowerBound;
+        uint8_t intensityIntervalUpperBound;
+        int*    compModelValue;
+    };
+
+    struct CompModel
+    {
+        bool    bPresentFlag;
+        uint8_t numModelValues;
+        uint8_t m_filmGrainNumIntensityIntervalMinus1;
+        CompModelIntensityValues* intensityValues;
+    };
+
+    CompModel   m_compModelMAX_NUM_COMPONENT;
+    bool        m_filmGrainCharacteristicsPersistenceFlag;
+    bool        m_filmGrainCharacteristicsCancelFlag;
+    bool        m_separateColourDescriptionPresentFlag;
+    bool        m_filmGrainFullRangeFlag;
+    uint8_t     m_filmGrainModelId;
+    uint8_t     m_blendingModeId;
+    uint8_t     m_log2ScaleFactor;
+    uint8_t     m_filmGrainBitDepthLumaMinus8;
+    uint8_t     m_filmGrainBitDepthChromaMinus8;
+    uint8_t     m_filmGrainColourPrimaries;
+    uint8_t     m_filmGrainTransferCharacteristics;
+    uint8_t     m_filmGrainMatrixCoeffs;
+
+    void writeSEI(const SPS&)
+    {
+        WRITE_FLAG(m_filmGrainCharacteristicsCancelFlag, "film_grain_characteristics_cancel_flag");
+
+        if (!m_filmGrainCharacteristicsCancelFlag)
+        {
+            WRITE_CODE(m_filmGrainModelId, 2, "film_grain_model_id");
+            WRITE_FLAG(m_separateColourDescriptionPresentFlag, "separate_colour_description_present_flag");
+            if (m_separateColourDescriptionPresentFlag)
+            {
+                WRITE_CODE(m_filmGrainBitDepthLumaMinus8, 3, "film_grain_bit_depth_luma_minus8");
+                WRITE_CODE(m_filmGrainBitDepthChromaMinus8, 3, "film_grain_bit_depth_chroma_minus8");
+                WRITE_FLAG(m_filmGrainFullRangeFlag, "film_grain_full_range_flag");
+                WRITE_CODE(m_filmGrainColourPrimaries, X265_BYTE, "film_grain_colour_primaries");
+                WRITE_CODE(m_filmGrainTransferCharacteristics, X265_BYTE, "film_grain_transfer_characteristics");
+                WRITE_CODE(m_filmGrainMatrixCoeffs, X265_BYTE, "film_grain_matrix_coeffs");
+            }
+            WRITE_CODE(m_blendingModeId, 2, "blending_mode_id");
+            WRITE_CODE(m_log2ScaleFactor, 4, "log2_scale_factor");
+            for (uint8_t c = 0; c < 3; c++)
+            {
+                WRITE_FLAG(m_compModelc.bPresentFlag && m_compModelc.m_filmGrainNumIntensityIntervalMinus1 + 1 > 0 && m_compModelc.numModelValues > 0, "comp_model_present_flagc");
+            }
+            for (uint8_t c = 0; c < 3; c++)
+            {
+                if (m_compModelc.bPresentFlag && m_compModelc.m_filmGrainNumIntensityIntervalMinus1 + 1 > 0 && m_compModelc.numModelValues > 0)
+                {
+                    assert(m_compModelc.m_filmGrainNumIntensityIntervalMinus1 + 1 <= 256);
+                    assert(m_compModelc.numModelValues <= X265_BYTE);
+                    WRITE_CODE(m_compModelc.m_filmGrainNumIntensityIntervalMinus1 , X265_BYTE, "num_intensity_intervals_minus1c");
+                    WRITE_CODE(m_compModelc.numModelValues - 1, 3, "num_model_values_minus1c");
+                    for (uint8_t interval = 0; interval < m_compModelc.m_filmGrainNumIntensityIntervalMinus1 + 1; interval++)
+                    {
+                        WRITE_CODE(m_compModelc.intensityValuesinterval.intensityIntervalLowerBound, X265_BYTE, "intensity_interval_lower_boundci");
+                        WRITE_CODE(m_compModelc.intensityValuesinterval.intensityIntervalUpperBound, X265_BYTE, "intensity_interval_upper_boundci");
+                        for (uint8_t j = 0; j < m_compModelc.numModelValues; j++)
+                        {
+                            WRITE_SVLC(m_compModelc.intensityValuesinterval.compModelValuej,"comp_model_valueci");
+                        }
+                    }
+                }
+            }
+            WRITE_FLAG(m_filmGrainCharacteristicsPersistenceFlag, "film_grain_characteristics_persistence_flag");
+        }
+        if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
+        {
+            WRITE_FLAG(1, "payload_bit_equal_to_one");
+            while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
+            {
+                WRITE_FLAG(0, "payload_bit_equal_to_zero");
+            }
+        }
+    }
+};
+
 static const uint32_t ISO_IEC_11578_LEN = 16;
 
 class SEIuserDataUnregistered : public SEI

 
@@ -73,6 +73,101 @@
     }
 };
 
+/* Film grain characteristics */
+class FilmGrainCharacteristics : public SEI
+{
+  public:
+
+    FilmGrainCharacteristics()
+    {
+        m_payloadType = FILM_GRAIN_CHARACTERISTICS;
+        m_payloadSize = 0;
+    }
+
+    struct CompModelIntensityValues
+    {
+        uint8_t intensityIntervalLowerBound;
+        uint8_t intensityIntervalUpperBound;
+        int*    compModelValue;
+    };
+
+    struct CompModel
+    {
+        bool    bPresentFlag;
+        uint8_t numModelValues;
+        uint8_t m_filmGrainNumIntensityIntervalMinus1;
+        CompModelIntensityValues* intensityValues;
+    };
+
+    CompModel   m_compModelMAX_NUM_COMPONENT;
+    bool        m_filmGrainCharacteristicsPersistenceFlag;
+    bool        m_filmGrainCharacteristicsCancelFlag;
+    bool        m_separateColourDescriptionPresentFlag;
+    bool        m_filmGrainFullRangeFlag;
+    uint8_t     m_filmGrainModelId;
+    uint8_t     m_blendingModeId;
+    uint8_t     m_log2ScaleFactor;
+    uint8_t     m_filmGrainBitDepthLumaMinus8;
+    uint8_t     m_filmGrainBitDepthChromaMinus8;
+    uint8_t     m_filmGrainColourPrimaries;
+    uint8_t     m_filmGrainTransferCharacteristics;
+    uint8_t     m_filmGrainMatrixCoeffs;
+
+    void writeSEI(const SPS&)
+    {
+        WRITE_FLAG(m_filmGrainCharacteristicsCancelFlag, "film_grain_characteristics_cancel_flag");
+
+        if (!m_filmGrainCharacteristicsCancelFlag)
+        {
+            WRITE_CODE(m_filmGrainModelId, 2, "film_grain_model_id");
+            WRITE_FLAG(m_separateColourDescriptionPresentFlag, "separate_colour_description_present_flag");
+            if (m_separateColourDescriptionPresentFlag)
+            {
+                WRITE_CODE(m_filmGrainBitDepthLumaMinus8, 3, "film_grain_bit_depth_luma_minus8");
+                WRITE_CODE(m_filmGrainBitDepthChromaMinus8, 3, "film_grain_bit_depth_chroma_minus8");
+                WRITE_FLAG(m_filmGrainFullRangeFlag, "film_grain_full_range_flag");
+                WRITE_CODE(m_filmGrainColourPrimaries, X265_BYTE, "film_grain_colour_primaries");
+                WRITE_CODE(m_filmGrainTransferCharacteristics, X265_BYTE, "film_grain_transfer_characteristics");
+                WRITE_CODE(m_filmGrainMatrixCoeffs, X265_BYTE, "film_grain_matrix_coeffs");
+            }
+            WRITE_CODE(m_blendingModeId, 2, "blending_mode_id");
+            WRITE_CODE(m_log2ScaleFactor, 4, "log2_scale_factor");
+            for (uint8_t c = 0; c < 3; c++)
+            {
+                WRITE_FLAG(m_compModelc.bPresentFlag && m_compModelc.m_filmGrainNumIntensityIntervalMinus1 + 1 > 0 && m_compModelc.numModelValues > 0, "comp_model_present_flagc");
+            }
+            for (uint8_t c = 0; c < 3; c++)
+            {
+                if (m_compModelc.bPresentFlag && m_compModelc.m_filmGrainNumIntensityIntervalMinus1 + 1 > 0 && m_compModelc.numModelValues > 0)
+                {
+                    assert(m_compModelc.m_filmGrainNumIntensityIntervalMinus1 + 1 <= 256);
+                    assert(m_compModelc.numModelValues <= X265_BYTE);
+                    WRITE_CODE(m_compModelc.m_filmGrainNumIntensityIntervalMinus1 , X265_BYTE, "num_intensity_intervals_minus1c");
+                    WRITE_CODE(m_compModelc.numModelValues - 1, 3, "num_model_values_minus1c");
+                    for (uint8_t interval = 0; interval < m_compModelc.m_filmGrainNumIntensityIntervalMinus1 + 1; interval++)
+                    {
+                        WRITE_CODE(m_compModelc.intensityValuesinterval.intensityIntervalLowerBound, X265_BYTE, "intensity_interval_lower_boundci");
+                        WRITE_CODE(m_compModelc.intensityValuesinterval.intensityIntervalUpperBound, X265_BYTE, "intensity_interval_upper_boundci");
+                        for (uint8_t j = 0; j < m_compModelc.numModelValues; j++)
+                        {
+                            WRITE_SVLC(m_compModelc.intensityValuesinterval.compModelValuej,"comp_model_valueci");
+                        }
+                    }
+                }
+            }
+            WRITE_FLAG(m_filmGrainCharacteristicsPersistenceFlag, "film_grain_characteristics_persistence_flag");
+        }
+        if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
+        {
+            WRITE_FLAG(1, "payload_bit_equal_to_one");
+            while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
+            {
+                WRITE_FLAG(0, "payload_bit_equal_to_zero");
+            }
+        }
+    }
+};
+
 static const uint32_t ISO_IEC_11578_LEN = 16;
 
 class SEIuserDataUnregistered : public SEI
​

x265_3.5.tar.gz/source/encoder/slicetype.cpp -> x265_3.6.tar.gz/source/encoder/slicetype.cpp Changed

@@ -87,6 +87,14 @@
 
 namespace X265_NS {
 
+uint32_t acEnergyVarHist(uint64_t sum_ssd, int shift)  /* unpacks a 64-bit sum/ssd pair and returns ssd - ((sum * sum) >> shift) */
+{
+    uint32_t sum = (uint32_t)sum_ssd;  /* low 32 bits of the packed value */
+    uint32_t ssd = (uint32_t)(sum_ssd >> 32);  /* high 32 bits of the packed value */
+
+    return ssd - ((uint64_t)sum * sum >> shift);  /* the square is formed in 64 bits; the difference is narrowed to uint32_t on return */
+}
+
 bool computeEdge(pixel* edgePic, pixel* refPic, pixel* edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta, pixel whitePixel)
 {
     intptr_t rowOne = 0, rowTwo = 0, rowThree = 0, colOne = 0, colTwo = 0, colThree = 0;
@@ -184,7 +192,7 @@
     {
         for (int colNum = 0; colNum < width; colNum++)
         {
-            if ((rowNum >= 2) && (colNum >= 2) && (rowNum != height - 2) && (colNum != width - 2)) //Ignoring the border pixels of the picture
+            if ((rowNum >= 2) && (colNum >= 2) && (rowNum < height - 2) && (colNum < width - 2)) //Ignoring the border pixels of the picture
             {
                 /*  5x5 Gaussian filter
                     2   4   5   4   2
@@ -519,7 +527,7 @@
                 if (param->rc.aqMode == X265_AQ_EDGE)
                     edgeFilter(curFrame, param);
 
-                if (param->rc.aqMode == X265_AQ_EDGE && !param->bHistBasedSceneCut && param->recursionSkipMode == EDGE_BASED_RSKIP)
+                if (param->rc.aqMode == X265_AQ_EDGE && param->recursionSkipMode == EDGE_BASED_RSKIP)
                 {
                     pixel* src = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
                     primitives.planecopy_pp_shr(src, curFrame->m_fencPic->m_stride, curFrame->m_edgeBitPic,
@@ -1050,7 +1058,48 @@
     m_countPreLookahead = 0;
 #endif
 
-    memset(m_histogram, 0, sizeof(m_histogram));
+    m_accHistDiffRunningAvgCb = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
+    m_accHistDiffRunningAvgCb[0] = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
+    memset(m_accHistDiffRunningAvgCb[0], 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
+    for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
+        m_accHistDiffRunningAvgCb[w] = m_accHistDiffRunningAvgCb[0] + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
+    }
+
+    m_accHistDiffRunningAvgCr = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
+    m_accHistDiffRunningAvgCr[0] = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
+    memset(m_accHistDiffRunningAvgCr[0], 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
+    for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
+        m_accHistDiffRunningAvgCr[w] = m_accHistDiffRunningAvgCr[0] + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
+    }
+
+    m_accHistDiffRunningAvg = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
+    m_accHistDiffRunningAvg[0] = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
+    memset(m_accHistDiffRunningAvg[0], 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
+    for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
+        m_accHistDiffRunningAvg[w] = m_accHistDiffRunningAvg[0] + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
+    }
+
+    m_resetRunningAvg = true;
+
+    m_segmentCountThreshold = (uint32_t)(((float)((NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT) * 50) / 100) + 0.5);
+
+    if (m_param->bEnableTemporalSubLayers > 2)
+    {
+        switch (m_param->bEnableTemporalSubLayers)
+        {
+        case 3:
+            m_gopId = 0;
+            break;
+        case 4:
+            m_gopId = 1;
+            break;
+        case 5:
+            m_gopId = 2;
+            break;
+        default:
+            break;
+        }
+    }
 }
 
 #if DETAILED_CU_STATS
@@ -1098,6 +1147,7 @@
             m_pool[i].stopWorkers();
     }
 }
+
 void Lookahead::destroy()
 {
     // these two queues will be empty unless the encode was aborted
@@ -1309,32 +1359,32 @@
     default:
         return;
     }
-    if (!m_param->analysisLoad || !m_param->bDisableLookahead)
+    if (!curFrame->m_param->analysisLoad || !curFrame->m_param->bDisableLookahead)
     {
         X265_CHECK(curFrame->m_lowres.costEst[b - p0][p1 - b] > 0, "Slice cost not estimated\n")
 
-        if (m_param->rc.cuTree && !m_param->rc.bStatRead)
+        if (curFrame->m_param->rc.cuTree && !curFrame->m_param->rc.bStatRead)
             /* update row satds based on cutree offsets */
             curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b);
-        else if (!m_param->analysisLoad || m_param->scaleFactor || m_param->bAnalysisType == HEVC_INFO)
+        else if (!curFrame->m_param->analysisLoad || curFrame->m_param->scaleFactor || curFrame->m_param->bAnalysisType == HEVC_INFO)
         {
-            if (m_param->rc.aqMode)
+            if (curFrame->m_param->rc.aqMode)
                 curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
             else
                 curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];
         }
-        if (m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate)
+        if (curFrame->m_param->rc.vbvBufferSize && curFrame->m_param->rc.vbvMaxBitrate)
         {
             /* aggregate lowres row satds to CTU resolution */
             curFrame->m_lowres.lowresCostForRc = curFrame->m_lowres.lowresCosts[b - p0][p1 - b];
             uint32_t lowresRow = 0, lowresCol = 0, lowresCuIdx = 0, sum = 0, intraSum = 0;
-            uint32_t scale = m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
-            uint32_t numCuInHeight = (m_param->sourceHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;
+            uint32_t scale = curFrame->m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
+            uint32_t numCuInHeight = (curFrame->m_param->sourceHeight + curFrame->m_param->maxCUSize - 1) / curFrame->m_param->maxCUSize;
             uint32_t widthInLowresCu = (uint32_t)m_8x8Width, heightInLowresCu = (uint32_t)m_8x8Height;
             double *qp_offset = 0;
             /* Factor in qpoffsets based on Aq/Cutree in CU costs */
-            if (m_param->rc.aqMode || m_param->bAQMotion)
-                qp_offset = (frames[b]->sliceType == X265_TYPE_B || !m_param->rc.cuTree) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset;
+            if (curFrame->m_param->rc.aqMode || curFrame->m_param->bAQMotion)
+                qp_offset = (frames[b]->sliceType == X265_TYPE_B || !curFrame->m_param->rc.cuTree) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset;
 
             for (uint32_t row = 0; row < numCuInHeight; row++)
             {
@@ -1350,7 +1400,7 @@
                         if (qp_offset)
                         {
                             double qpOffset;
-                            if (m_param->rc.qgSize == 8)
+                            if (curFrame->m_param->rc.qgSize == 8)
                                 qpOffset = (qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4] +
                                 qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4 + 1] +
                                 qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4 + curFrame->m_lowres.maxBlocksInRowFullRes] +
@@ -1361,7 +1411,7 @@
                             int32_t intraCuCost = curFrame->m_lowres.intraCost[lowresCuIdx];
                             curFrame->m_lowres.intraCost[lowresCuIdx] = (intraCuCost * x265_exp2fix8(qpOffset) + 128) >> 8;
                         }
-                        if (m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
+                        if (curFrame->m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
                             for (uint32_t x = curFrame->m_encData->m_pir.pirStartCol; x <= curFrame->m_encData->m_pir.pirEndCol; x++)
                                 diff += curFrame->m_lowres.intraCost[lowresCuIdx] - lowresCuCost;
                         curFrame->m_lowres.lowresCostForRc[lowresCuIdx] = lowresCuCost;
@@ -1377,6 +1427,291 @@
     }
 }
 
+uint32_t LookaheadTLD::calcVariance(pixel* inpSrc, intptr_t stride, intptr_t blockOffset, uint32_t plane)  /* returns acEnergyVarHist() of the block at inpSrc + blockOffset: 8x8 primitive when plane is 0, 4x4 otherwise */
+{
+    pixel* src = inpSrc + blockOffset;
+
+    uint32_t var;
+    if (!plane)
+        var = acEnergyVarHist(primitives.cu[BLOCK_8x8].var(src, stride), 6);  /* shift 6: an 8x8 block has 64 = 2^6 pixels */
+    else
+        var = acEnergyVarHist(primitives.cu[BLOCK_4x4].var(src, stride), 4);  /* shift 4: a 4x4 block has 16 = 2^4 pixels */
+
+    x265_emms();
+    return var;
+}
+
+/*
+** Compute Block and Picture Variance, Block Mean for all blocks in the picture
+*/
+void LookaheadTLD::computePictureStatistics(Frame *curFrame)
+{
+    int maxCol = curFrame->m_fencPic->m_picWidth;
+    int maxRow = curFrame->m_fencPic->m_picHeight;
+    intptr_t inpStride = curFrame->m_fencPic->m_stride;
+
+    // Variance
+    uint64_t picTotVariance = 0;
+    uint32_t variance;
+
+    uint64_t blockXY = 0;
+    pixel* src = curFrame->m_fencPic->m_picOrg[0];
+
+    for (int blockY = 0; blockY < maxRow; blockY += 8)
+    {
+        uint64_t rowVariance = 0;
+        for (int blockX = 0; blockX < maxCol; blockX += 8)
+        {
+            intptr_t blockOffsetLuma = blockX + (blockY * inpStride);
+
+            variance = calcVariance(
+                src,
+                inpStride,
+                blockOffsetLuma, 0);
+
+            rowVariance += variance;
+            blockXY++;
+        }

 
@@ -87,6 +87,14 @@
 
 namespace X265_NS {
 
+uint32_t acEnergyVarHist(uint64_t sum_ssd, int shift)  /* unpacks a 64-bit sum/ssd pair and returns ssd - ((sum * sum) >> shift) */
+{
+    uint32_t sum = (uint32_t)sum_ssd;  /* low 32 bits of the packed value */
+    uint32_t ssd = (uint32_t)(sum_ssd >> 32);  /* high 32 bits of the packed value */
+
+    return ssd - ((uint64_t)sum * sum >> shift);  /* the square is formed in 64 bits; the difference is narrowed to uint32_t on return */
+}
+
 bool computeEdge(pixel* edgePic, pixel* refPic, pixel* edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta, pixel whitePixel)
 {
     intptr_t rowOne = 0, rowTwo = 0, rowThree = 0, colOne = 0, colTwo = 0, colThree = 0;
@@ -184,7 +192,7 @@
     {
         for (int colNum = 0; colNum < width; colNum++)
         {
-            if ((rowNum >= 2) && (colNum >= 2) && (rowNum != height - 2) && (colNum != width - 2)) //Ignoring the border pixels of the picture
+            if ((rowNum >= 2) && (colNum >= 2) && (rowNum < height - 2) && (colNum < width - 2)) //Ignoring the border pixels of the picture
             {
                 /*  5x5 Gaussian filter
                     2   4   5   4   2
@@ -519,7 +527,7 @@
                 if (param->rc.aqMode == X265_AQ_EDGE)
                     edgeFilter(curFrame, param);
 
-                if (param->rc.aqMode == X265_AQ_EDGE && !param->bHistBasedSceneCut && param->recursionSkipMode == EDGE_BASED_RSKIP)
+                if (param->rc.aqMode == X265_AQ_EDGE && param->recursionSkipMode == EDGE_BASED_RSKIP)
                 {
                     pixel* src = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
                     primitives.planecopy_pp_shr(src, curFrame->m_fencPic->m_stride, curFrame->m_edgeBitPic,
@@ -1050,7 +1058,48 @@
     m_countPreLookahead = 0;
 #endif
 
-    memset(m_histogram, 0, sizeof(m_histogram));
+    m_accHistDiffRunningAvgCb = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
+    m_accHistDiffRunningAvgCb0 = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
+    memset(m_accHistDiffRunningAvgCb0, 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
+    for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
+        m_accHistDiffRunningAvgCbw = m_accHistDiffRunningAvgCb0 + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
+    }
+
+    m_accHistDiffRunningAvgCr = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
+    m_accHistDiffRunningAvgCr0 = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
+    memset(m_accHistDiffRunningAvgCr0, 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
+    for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
+        m_accHistDiffRunningAvgCrw = m_accHistDiffRunningAvgCr0 + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
+    }
+
+    m_accHistDiffRunningAvg = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
+    m_accHistDiffRunningAvg0 = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
+    memset(m_accHistDiffRunningAvg0, 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
+    for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
+        m_accHistDiffRunningAvgw = m_accHistDiffRunningAvg0 + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
+    }
+
+    m_resetRunningAvg = true;
+
+    m_segmentCountThreshold = (uint32_t)(((float)((NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT) * 50) / 100) + 0.5);
+
+    if (m_param->bEnableTemporalSubLayers > 2)
+    {
+        switch (m_param->bEnableTemporalSubLayers)
+        {
+        case 3:
+            m_gopId = 0;
+            break;
+        case 4:
+            m_gopId = 1;
+            break;
+        case 5:
+            m_gopId = 2;
+            break;
+        default:
+            break;
+        }
+    }
 }
 
 #if DETAILED_CU_STATS
@@ -1098,6 +1147,7 @@
             m_pooli.stopWorkers();
     }
 }
+
 void Lookahead::destroy()
 {
     // these two queues will be empty unless the encode was aborted
@@ -1309,32 +1359,32 @@
     default:
         return;
     }
-    if (!m_param->analysisLoad || !m_param->bDisableLookahead)
+    if (!curFrame->m_param->analysisLoad || !curFrame->m_param->bDisableLookahead)
     {
         X265_CHECK(curFrame->m_lowres.costEstb - p0p1 - b > 0, "Slice cost not estimated\n")
 
-        if (m_param->rc.cuTree && !m_param->rc.bStatRead)
+        if (curFrame->m_param->rc.cuTree && !curFrame->m_param->rc.bStatRead)
             /* update row satds based on cutree offsets */
             curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b);
-        else if (!m_param->analysisLoad || m_param->scaleFactor || m_param->bAnalysisType == HEVC_INFO)
+        else if (!curFrame->m_param->analysisLoad || curFrame->m_param->scaleFactor || curFrame->m_param->bAnalysisType == HEVC_INFO)
         {
-            if (m_param->rc.aqMode)
+            if (curFrame->m_param->rc.aqMode)
                 curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAqb - p0p1 - b;
             else
                 curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstb - p0p1 - b;
         }
-        if (m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate)
+        if (curFrame->m_param->rc.vbvBufferSize && curFrame->m_param->rc.vbvMaxBitrate)
         {
             /* aggregate lowres row satds to CTU resolution */
             curFrame->m_lowres.lowresCostForRc = curFrame->m_lowres.lowresCostsb - p0p1 - b;
             uint32_t lowresRow = 0, lowresCol = 0, lowresCuIdx = 0, sum = 0, intraSum = 0;
-            uint32_t scale = m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
-            uint32_t numCuInHeight = (m_param->sourceHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;
+            uint32_t scale = curFrame->m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
+            uint32_t numCuInHeight = (curFrame->m_param->sourceHeight + curFrame->m_param->maxCUSize - 1) / curFrame->m_param->maxCUSize;
             uint32_t widthInLowresCu = (uint32_t)m_8x8Width, heightInLowresCu = (uint32_t)m_8x8Height;
             double *qp_offset = 0;
             /* Factor in qpoffsets based on Aq/Cutree in CU costs */
-            if (m_param->rc.aqMode || m_param->bAQMotion)
-                qp_offset = (framesb->sliceType == X265_TYPE_B || !m_param->rc.cuTree) ? framesb->qpAqOffset : framesb->qpCuTreeOffset;
+            if (curFrame->m_param->rc.aqMode || curFrame->m_param->bAQMotion)
+                qp_offset = (framesb->sliceType == X265_TYPE_B || !curFrame->m_param->rc.cuTree) ? framesb->qpAqOffset : framesb->qpCuTreeOffset;
 
             for (uint32_t row = 0; row < numCuInHeight; row++)
             {
@@ -1350,7 +1400,7 @@
                         if (qp_offset)
                         {
                             double qpOffset;
-                            if (m_param->rc.qgSize == 8)
+                            if (curFrame->m_param->rc.qgSize == 8)
                                 qpOffset = (qp_offsetlowresCol * 2 + lowresRow * widthInLowresCu * 4 +
                                 qp_offsetlowresCol * 2 + lowresRow * widthInLowresCu * 4 + 1 +
                                 qp_offsetlowresCol * 2 + lowresRow * widthInLowresCu * 4 + curFrame->m_lowres.maxBlocksInRowFullRes +
@@ -1361,7 +1411,7 @@
                             int32_t intraCuCost = curFrame->m_lowres.intraCostlowresCuIdx;
                             curFrame->m_lowres.intraCostlowresCuIdx = (intraCuCost * x265_exp2fix8(qpOffset) + 128) >> 8;
                         }
-                        if (m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
+                        if (curFrame->m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
                             for (uint32_t x = curFrame->m_encData->m_pir.pirStartCol; x <= curFrame->m_encData->m_pir.pirEndCol; x++)
                                 diff += curFrame->m_lowres.intraCostlowresCuIdx - lowresCuCost;
                         curFrame->m_lowres.lowresCostForRclowresCuIdx = lowresCuCost;
@@ -1377,6 +1427,291 @@
     }
 }
 
+uint32_t LookaheadTLD::calcVariance(pixel* inpSrc, intptr_t stride, intptr_t blockOffset, uint32_t plane)
+{
+    pixel* src = inpSrc + blockOffset;
+
+    uint32_t var;
+    if (!plane)
+        var = acEnergyVarHist(primitives.cuBLOCK_8x8.var(src, stride), 6);
+    else
+        var = acEnergyVarHist(primitives.cuBLOCK_4x4.var(src, stride), 4);
+
+    x265_emms();
+    return var;
+}
+
+/*
+** Compute Block and Picture Variance, Block Mean for all blocks in the picture
+*/
+void LookaheadTLD::computePictureStatistics(Frame *curFrame)
+{
+    int maxCol = curFrame->m_fencPic->m_picWidth;
+    int maxRow = curFrame->m_fencPic->m_picHeight;
+    intptr_t inpStride = curFrame->m_fencPic->m_stride;
+
+    // Variance
+    uint64_t picTotVariance = 0;
+    uint32_t variance;
+
+    uint64_t blockXY = 0;
+    pixel* src = curFrame->m_fencPic->m_picOrg0;
+
+    for (int blockY = 0; blockY < maxRow; blockY += 8)
+    {
+        uint64_t rowVariance = 0;
+        for (int blockX = 0; blockX < maxCol; blockX += 8)
+        {
+            intptr_t blockOffsetLuma = blockX + (blockY * inpStride);
+
+            variance = calcVariance(
+                src,
+                inpStride,
+                blockOffsetLuma, 0);
+
+            rowVariance += variance;
+            blockXY++;
+        }
​

x265_3.5.tar.gz/source/encoder/slicetype.h -> x265_3.6.tar.gz/source/encoder/slicetype.h Changed

@@ -44,6 +44,24 @@
 #define EDGE_INCLINATION 45
 #define TEMPORAL_SCENECUT_THRESHOLD 50
 
+#define X265_ABS(a)                        (((a) < 0) ? (-(a)) : (a))
+
+#define PICTURE_DIFF_VARIANCE_TH            390
+#define PICTURE_VARIANCE_TH                 1500
+#define LOW_VAR_SCENE_CHANGE_TH             2250
+#define HIGH_VAR_SCENE_CHANGE_TH            3500
+
+#define PICTURE_DIFF_VARIANCE_CHROMA_TH     10
+#define PICTURE_VARIANCE_CHROMA_TH          20
+#define LOW_VAR_SCENE_CHANGE_CHROMA_TH      2250/4
+#define HIGH_VAR_SCENE_CHANGE_CHROMA_TH     3500/4
+
+#define FLASH_TH                            1.5
+#define FADE_TH                             4
+#define INTENSITY_CHANGE_TH                 4
+
+#define NUM64x64INPIC(w,h)                  ((w*h)>> (MAX_LOG2_CU_SIZE<<1))
+
 #if HIGH_BIT_DEPTH
 #define EDGE_THRESHOLD 1023.0
 #else
@@ -93,7 +111,29 @@
 
     ~LookaheadTLD() { X265_FREE(wbuffer0); }
 
+    void collectPictureStatistics(Frame *curFrame);
+    void computeIntensityHistogramBinsLuma(Frame *curFrame, uint64_t *sumAvgIntensityTotalSegmentsLuma);
+
+    void computeIntensityHistogramBinsChroma(
+        Frame    *curFrame,
+        uint64_t *sumAverageIntensityCb,
+        uint64_t *sumAverageIntensityCr);
+
+    void calculateHistogram(
+        pixel    *inputSrc,
+        uint32_t  inputWidth,
+        uint32_t  inputHeight,
+        intptr_t  stride,
+        uint8_t   dsFactor,
+        uint32_t *histogram,
+        uint64_t *sum);
+
+    void computePictureStatistics(Frame *curFrame);
+
+    uint32_t calcVariance(pixel* src, intptr_t stride, intptr_t blockOffset, uint32_t plane);
+
     void calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param);
+    void calcFrameSegment(Frame *curFrame);
     void lowresIntraEstimate(Lowres& fenc, uint32_t qgSize);
 
     void weightsAnalyse(Lowres& fenc, Lowres& ref);
@@ -124,7 +164,6 @@
 
     /* pre-lookahead */
     int           m_fullQueueSize;
-    int           m_histogramX265_BFRAME_MAX + 1;
     int           m_lastKeyframe;
     int           m_8x8Width;
     int           m_8x8Height;
@@ -153,6 +192,16 @@
     bool          m_isFadeIn;
     uint64_t      m_fadeCount;
     int           m_fadeStart;
+
+    uint32_t    **m_accHistDiffRunningAvgCb;
+    uint32_t    **m_accHistDiffRunningAvgCr;
+    uint32_t    **m_accHistDiffRunningAvg;
+
+    bool          m_resetRunningAvg;
+    uint32_t      m_segmentCountThreshold;
+
+    int8_t                  m_gopId;
+
     Lookahead(x265_param *param, ThreadPool *pool);
 #if DETAILED_CU_STATS
     int64_t       m_slicetypeDecideElapsedTime;
@@ -174,6 +223,7 @@
 
     void    getEstimatedPictureCost(Frame *pic);
     void    setLookaheadQueue();
+    int     findSliceType(int poc);
 
 protected:
 
@@ -184,6 +234,10 @@
     /* called by slicetypeAnalyse() to make slice decisions */
     bool    scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames);
     bool    scenecutInternal(Lowres **frames, int p0, int p1, bool bRealScenecut);
+
+    bool    histBasedScenecut(Lowres **frames, int p0, int p1, int numFrames);
+    bool    detectHistBasedSceneChange(Lowres **frames, int p0, int p1, int p2);
+
     void    slicetypePath(Lowres **frames, int length, char(*best_paths)X265_LOOKAHEAD_MAX + 1);
     int64_t slicetypePathCost(Lowres **frames, char *path, int64_t threshold);
     int64_t vbvFrameCost(Lowres **frames, int p0, int p1, int b);
@@ -199,6 +253,9 @@
 
     /* called by getEstimatedPictureCost() to finalize cuTree costs */
     int64_t frameCostRecalculate(Lowres **frames, int p0, int p1, int b);
+    /*Compute index for positioning B-Ref frames*/
+    void     placeBref(Frame** frames, int start, int end, int num, int *brefs);
+    void     compCostBref(Lowres **frame, int start, int end, int num);
 };
 
 class PreLookaheadGroup : public BondedTaskGroup

 
@@ -44,6 +44,24 @@
 #define EDGE_INCLINATION 45
 #define TEMPORAL_SCENECUT_THRESHOLD 50
 
+#define X265_ABS(a)                        (((a) < 0) ? (-(a)) : (a))
+
+#define PICTURE_DIFF_VARIANCE_TH            390
+#define PICTURE_VARIANCE_TH                 1500
+#define LOW_VAR_SCENE_CHANGE_TH             2250
+#define HIGH_VAR_SCENE_CHANGE_TH            3500
+
+#define PICTURE_DIFF_VARIANCE_CHROMA_TH     10
+#define PICTURE_VARIANCE_CHROMA_TH          20
+#define LOW_VAR_SCENE_CHANGE_CHROMA_TH      2250/4
+#define HIGH_VAR_SCENE_CHANGE_CHROMA_TH     3500/4
+
+#define FLASH_TH                            1.5
+#define FADE_TH                             4
+#define INTENSITY_CHANGE_TH                 4
+
+#define NUM64x64INPIC(w,h)                  ((w*h)>> (MAX_LOG2_CU_SIZE<<1))
+
 #if HIGH_BIT_DEPTH
 #define EDGE_THRESHOLD 1023.0
 #else
@@ -93,7 +111,29 @@
 
     ~LookaheadTLD() { X265_FREE(wbuffer0); }
 
+    void collectPictureStatistics(Frame *curFrame);
+    void computeIntensityHistogramBinsLuma(Frame *curFrame, uint64_t *sumAvgIntensityTotalSegmentsLuma);
+
+    void computeIntensityHistogramBinsChroma(
+        Frame    *curFrame,
+        uint64_t *sumAverageIntensityCb,
+        uint64_t *sumAverageIntensityCr);
+
+    void calculateHistogram(
+        pixel    *inputSrc,
+        uint32_t  inputWidth,
+        uint32_t  inputHeight,
+        intptr_t  stride,
+        uint8_t   dsFactor,
+        uint32_t *histogram,
+        uint64_t *sum);
+
+    void computePictureStatistics(Frame *curFrame);
+
+    uint32_t calcVariance(pixel* src, intptr_t stride, intptr_t blockOffset, uint32_t plane);
+
     void calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param);
+    void calcFrameSegment(Frame *curFrame);
     void lowresIntraEstimate(Lowres& fenc, uint32_t qgSize);
 
     void weightsAnalyse(Lowres& fenc, Lowres& ref);
@@ -124,7 +164,6 @@
 
     /* pre-lookahead */
     int           m_fullQueueSize;
-    int           m_histogramX265_BFRAME_MAX + 1;
     int           m_lastKeyframe;
     int           m_8x8Width;
     int           m_8x8Height;
@@ -153,6 +192,16 @@
     bool          m_isFadeIn;
     uint64_t      m_fadeCount;
     int           m_fadeStart;
+
+    uint32_t    **m_accHistDiffRunningAvgCb;
+    uint32_t    **m_accHistDiffRunningAvgCr;
+    uint32_t    **m_accHistDiffRunningAvg;
+
+    bool          m_resetRunningAvg;
+    uint32_t      m_segmentCountThreshold;
+
+    int8_t                  m_gopId;
+
     Lookahead(x265_param *param, ThreadPool *pool);
 #if DETAILED_CU_STATS
     int64_t       m_slicetypeDecideElapsedTime;
@@ -174,6 +223,7 @@
 
     void    getEstimatedPictureCost(Frame *pic);
     void    setLookaheadQueue();
+    int     findSliceType(int poc);
 
 protected:
 
@@ -184,6 +234,10 @@
     /* called by slicetypeAnalyse() to make slice decisions */
     bool    scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames);
     bool    scenecutInternal(Lowres **frames, int p0, int p1, bool bRealScenecut);
+
+    bool    histBasedScenecut(Lowres **frames, int p0, int p1, int numFrames);
+    bool    detectHistBasedSceneChange(Lowres **frames, int p0, int p1, int p2);
+
     void    slicetypePath(Lowres **frames, int length, char(*best_paths)X265_LOOKAHEAD_MAX + 1);
     int64_t slicetypePathCost(Lowres **frames, char *path, int64_t threshold);
     int64_t vbvFrameCost(Lowres **frames, int p0, int p1, int b);
@@ -199,6 +253,9 @@
 
     /* called by getEstimatedPictureCost() to finalize cuTree costs */
     int64_t frameCostRecalculate(Lowres **frames, int p0, int p1, int b);
+    /*Compute index for positioning B-Ref frames*/
+    void     placeBref(Frame** frames, int start, int end, int num, int *brefs);
+    void     compCostBref(Lowres **frame, int start, int end, int num);
 };
 
 class PreLookaheadGroup : public BondedTaskGroup
​

x265_3.5.tar.gz/source/output/output.cpp -> x265_3.6.tar.gz/source/output/output.cpp Changed

 
@@ -30,14 +30,14 @@
 
 using namespace X265_NS;
 
-ReconFile* ReconFile::open(const char *fname, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp)
+ReconFile* ReconFile::open(const char *fname, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int sourceBitDepth)
 {
     const char * s = strrchr(fname, '.');
 
     if (s && !strcmp(s, ".y4m"))
-        return new Y4MOutput(fname, width, height, fpsNum, fpsDenom, csp);
+        return new Y4MOutput(fname, width, height, bitdepth, fpsNum, fpsDenom, csp, sourceBitDepth);
     else
-        return new YUVOutput(fname, width, height, bitdepth, csp);
+        return new YUVOutput(fname, width, height, bitdepth, csp, sourceBitDepth);
 }
 
 OutputFile* OutputFile::open(const char *fname, InputFileInfo& inputInfo)
​

x265_3.5.tar.gz/source/output/output.h -> x265_3.6.tar.gz/source/output/output.h Changed

 
@@ -42,7 +42,7 @@
     ReconFile()           {}
 
     static ReconFile* open(const char *fname, int width, int height, uint32_t bitdepth,
-                           uint32_t fpsNum, uint32_t fpsDenom, int csp);
+                           uint32_t fpsNum, uint32_t fpsDenom, int csp, int sourceBitDepth);
 
     virtual bool isFail() const = 0;
 
​

x265_3.5.tar.gz/source/output/y4m.cpp -> x265_3.6.tar.gz/source/output/y4m.cpp Changed

@@ -28,11 +28,13 @@
 using namespace X265_NS;
 using namespace std;
 
-Y4MOutput::Y4MOutput(const char *filename, int w, int h, uint32_t fpsNum, uint32_t fpsDenom, int csp)
+Y4MOutput::Y4MOutput(const char* filename, int w, int h, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int inputdepth)
     : width(w)
     , height(h)
+    , bitDepth(bitdepth)
     , colorSpace(csp)
     , frameSize(0)
+    , inputDepth(inputdepth)
 {
     ofs.open(filename, ios::binary | ios::out);
     buf = new charwidth;
@@ -41,7 +43,13 @@
 
     if (ofs)
     {
-        ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "\n";
+        if (bitDepth == 10)
+            ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "p10" << " XYSCSS = " << cf << "P10" << "\n";
+        else if (bitDepth == 12)
+            ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "p12" << " XYSCSS = " << cf << "P12" << "\n";
+        else
+            ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "\n";
+
         header = ofs.tellp();
     }
 
@@ -58,52 +66,81 @@
 bool Y4MOutput::writePicture(const x265_picture& pic)
 {
     std::ofstream::pos_type outPicPos = header;
-    outPicPos += (uint64_t)pic.poc * (6 + frameSize);
+    if (pic.bitDepth > 8)
+        outPicPos += (uint64_t)(pic.poc * (6 + frameSize * 2));
+    else
+        outPicPos += (uint64_t)pic.poc * (6 + frameSize);
     ofs.seekp(outPicPos);
     ofs << "FRAME\n";
 
-#if HIGH_BIT_DEPTH
-    if (pic.bitDepth > 8 && pic.poc == 0)
-        x265_log(NULL, X265_LOG_WARNING, "y4m: down-shifting reconstructed pixels to 8 bits\n");
-#else
-    if (pic.bitDepth > 8 && pic.poc == 0)
-        x265_log(NULL, X265_LOG_WARNING, "y4m: forcing reconstructed pixels to 8 bits\n");
-#endif
+    if (inputDepth > 8)
+    {
+        if (pic.bitDepth == 8 && pic.poc == 0)
+            x265_log(NULL, X265_LOG_WARNING, "y4m: down-shifting reconstructed pixels to 8 bits\n");
+    }
 
     X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
 
-#if HIGH_BIT_DEPTH
-
-    // encoder gave us short pixels, downshift, then write
-    X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
-    int shift = pic.bitDepth - 8;
-    for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
+    if (inputDepth > 8)//if HIGH_BIT_DEPTH
     {
-        uint16_t *src = (uint16_t*)pic.planesi;
-        for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
+        if (pic.bitDepth == 8)
         {
-            for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
-                bufw = (char)(srcw >> shift);
-
-            ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
-            src += pic.stridei / sizeof(*src);
+            // encoder gave us short pixels, downshift, then write
+            X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
+            int shift = pic.bitDepth - 8;
+            for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
+            {
+                char *src = (char*)pic.planesi;
+                for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
+                {
+                    for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
+                        bufw = (char)(srcw >> shift);
+
+                    ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
+                    src += pic.stridei / sizeof(*src);
+                }
+            }
+        }
+        else
+        {
+            X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
+            for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
+            {
+                uint16_t *src = (uint16_t*)pic.planesi;
+                for (int h = 0; h < (height * 1) >> x265_cli_cspscolorSpace.heighti; h++)
+                {
+                    ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
+                    src += pic.stridei / sizeof(*src);
+                }
+            }
         }
     }
-
-#else // if HIGH_BIT_DEPTH
-
-    X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
-    for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
+    else if (inputDepth == 8 && pic.bitDepth > 8)
     {
-        char *src = (char*)pic.planesi;
-        for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
+        X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
+        for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
         {
-            ofs.write(src, width >> x265_cli_cspscolorSpace.widthi);
-            src += pic.stridei / sizeof(*src);
+            uint16_t* src = (uint16_t*)pic.planesi;
+            for (int h = 0; h < (height * 1) >> x265_cli_cspscolorSpace.heighti; h++)
+            {
+                ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
+                src += pic.stridei / sizeof(*src);
+            }
+        }
+    }
+    else
+    {
+        X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
+        for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
+        {
+            char *src = (char*)pic.planesi;
+            for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
+            {
+                ofs.write(src, width >> x265_cli_cspscolorSpace.widthi);
+                src += pic.stridei / sizeof(*src);
+            }
         }
     }
-
-#endif // if HIGH_BIT_DEPTH
 
     return true;
 }

 
@@ -28,11 +28,13 @@
 using namespace X265_NS;
 using namespace std;
 
-Y4MOutput::Y4MOutput(const char *filename, int w, int h, uint32_t fpsNum, uint32_t fpsDenom, int csp)
+Y4MOutput::Y4MOutput(const char* filename, int w, int h, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int inputdepth)
     : width(w)
     , height(h)
+    , bitDepth(bitdepth)
     , colorSpace(csp)
     , frameSize(0)
+    , inputDepth(inputdepth)
 {
     ofs.open(filename, ios::binary | ios::out);
     buf = new charwidth;
@@ -41,7 +43,13 @@
 
     if (ofs)
     {
-        ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "\n";
+        if (bitDepth == 10)
+            ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "p10" << " XYSCSS = " << cf << "P10" << "\n";
+        else if (bitDepth == 12)
+            ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "p12" << " XYSCSS = " << cf << "P12" << "\n";
+        else
+            ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "\n";
+
         header = ofs.tellp();
     }
 
@@ -58,52 +66,81 @@
 bool Y4MOutput::writePicture(const x265_picture& pic)
 {
     std::ofstream::pos_type outPicPos = header;
-    outPicPos += (uint64_t)pic.poc * (6 + frameSize);
+    if (pic.bitDepth > 8)
+        outPicPos += (uint64_t)(pic.poc * (6 + frameSize * 2));
+    else
+        outPicPos += (uint64_t)pic.poc * (6 + frameSize);
     ofs.seekp(outPicPos);
     ofs << "FRAME\n";
 
-#if HIGH_BIT_DEPTH
-    if (pic.bitDepth > 8 && pic.poc == 0)
-        x265_log(NULL, X265_LOG_WARNING, "y4m: down-shifting reconstructed pixels to 8 bits\n");
-#else
-    if (pic.bitDepth > 8 && pic.poc == 0)
-        x265_log(NULL, X265_LOG_WARNING, "y4m: forcing reconstructed pixels to 8 bits\n");
-#endif
+    if (inputDepth > 8)
+    {
+        if (pic.bitDepth == 8 && pic.poc == 0)
+            x265_log(NULL, X265_LOG_WARNING, "y4m: down-shifting reconstructed pixels to 8 bits\n");
+    }
 
     X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
 
-#if HIGH_BIT_DEPTH
-
-    // encoder gave us short pixels, downshift, then write
-    X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
-    int shift = pic.bitDepth - 8;
-    for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
+    if (inputDepth > 8)//if HIGH_BIT_DEPTH
     {
-        uint16_t *src = (uint16_t*)pic.planesi;
-        for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
+        if (pic.bitDepth == 8)
         {
-            for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
-                bufw = (char)(srcw >> shift);
-
-            ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
-            src += pic.stridei / sizeof(*src);
+            // encoder gave us short pixels, downshift, then write
+            X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
+            int shift = pic.bitDepth - 8;
+            for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
+            {
+                char *src = (char*)pic.planesi;
+                for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
+                {
+                    for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
+                        bufw = (char)(srcw >> shift);
+
+                    ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
+                    src += pic.stridei / sizeof(*src);
+                }
+            }
+        }
+        else
+        {
+            X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
+            for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
+            {
+                uint16_t *src = (uint16_t*)pic.planesi;
+                for (int h = 0; h < (height * 1) >> x265_cli_cspscolorSpace.heighti; h++)
+                {
+                    ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
+                    src += pic.stridei / sizeof(*src);
+                }
+            }
         }
     }
-
-#else // if HIGH_BIT_DEPTH
-
-    X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
-    for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
+    else if (inputDepth == 8 && pic.bitDepth > 8)
     {
-        char *src = (char*)pic.planesi;
-        for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
+        X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
+        for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
         {
-            ofs.write(src, width >> x265_cli_cspscolorSpace.widthi);
-            src += pic.stridei / sizeof(*src);
+            uint16_t* src = (uint16_t*)pic.planesi;
+            for (int h = 0; h < (height * 1) >> x265_cli_cspscolorSpace.heighti; h++)
+            {
+                ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
+                src += pic.stridei / sizeof(*src);
+            }
+        }
+    }
+    else
+    {
+        X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
+        for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
+        {
+            char *src = (char*)pic.planesi;
+            for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
+            {
+                ofs.write(src, width >> x265_cli_cspscolorSpace.widthi);
+                src += pic.stridei / sizeof(*src);
+            }
         }
     }
-
-#endif // if HIGH_BIT_DEPTH
 
     return true;
 }
​

x265_3.5.tar.gz/source/output/y4m.h -> x265_3.6.tar.gz/source/output/y4m.h Changed

 
@@ -38,10 +38,14 @@
 
     int height;
 
+    uint32_t bitDepth;
+
     int colorSpace;
 
     uint32_t frameSize;
 
+    int inputDepth;
+
     std::ofstream ofs;
 
     std::ofstream::pos_type header;
@@ -52,7 +56,7 @@
 
 public:
 
-    Y4MOutput(const char *filename, int width, int height, uint32_t fpsNum, uint32_t fpsDenom, int csp);
+    Y4MOutput(const char *filename, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int inputDepth);
 
     virtual ~Y4MOutput();
 
​

x265_3.5.tar.gz/source/output/yuv.cpp -> x265_3.6.tar.gz/source/output/yuv.cpp Changed

@@ -28,12 +28,13 @@
 using namespace X265_NS;
 using namespace std;
 
-YUVOutput::YUVOutput(const char *filename, int w, int h, uint32_t d, int csp)
+YUVOutput::YUVOutput(const char *filename, int w, int h, uint32_t d, int csp, int inputdepth)
     : width(w)
     , height(h)
     , depth(d)
     , colorSpace(csp)
     , frameSize(0)
+    , inputDepth(inputdepth)
 {
     ofs.open(filename, ios::binary | ios::out);
     buf = new charwidth;
@@ -56,50 +57,52 @@
     X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
     X265_CHECK(pic.bitDepth == (int)depth, "invalid bit depth\n");
 
-#if HIGH_BIT_DEPTH
-    if (depth == 8)
+    if (inputDepth > 8)
     {
-        int shift = pic.bitDepth - 8;
-        ofs.seekp((std::streamoff)fileOffset);
-        for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
-        {
-            uint16_t *src = (uint16_t*)pic.planesi;
-            for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
-            {
-                for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
-                    bufw = (char)(srcw >> shift);
+	if (depth == 8)
+	{
+		int shift = pic.bitDepth - 8;
+		ofs.seekp((std::streamoff)fileOffset);
+		for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
+		{
+			uint16_t *src = (uint16_t*)pic.planesi;
+			for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
+			{
+				for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
+					bufw = (char)(srcw >> shift);
 
-                ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
-                src += pic.stridei / sizeof(*src);
-            }
-        }
+				ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
+				src += pic.stridei / sizeof(*src);
+			}
+		}
+	}
+	else
+	{
+		ofs.seekp((std::streamoff)(fileOffset * 2));
+		for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
+		{
+			uint16_t *src = (uint16_t*)pic.planesi;
+			for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
+			{
+				ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
+				src += pic.stridei / sizeof(*src);
+			}
+		}
+	}
     }
     else
     {
-        ofs.seekp((std::streamoff)(fileOffset * 2));
-        for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
-        {
-            uint16_t *src = (uint16_t*)pic.planesi;
-            for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
-            {
-                ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
-                src += pic.stridei / sizeof(*src);
-            }
-        }
+	ofs.seekp((std::streamoff)fileOffset);
+	for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
+	{
+		char *src = (char*)pic.planesi;
+		for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
+		{
+			ofs.write(src, width >> x265_cli_cspscolorSpace.widthi);
+			src += pic.stridei / sizeof(*src);
+		}
+	}
     }
-#else // if HIGH_BIT_DEPTH
-    ofs.seekp((std::streamoff)fileOffset);
-    for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
-    {
-        char *src = (char*)pic.planesi;
-        for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
-        {
-            ofs.write(src, width >> x265_cli_cspscolorSpace.widthi);
-            src += pic.stridei / sizeof(*src);
-        }
-    }
-
-#endif // if HIGH_BIT_DEPTH
 
     return true;
 }

 
@@ -28,12 +28,13 @@
 using namespace X265_NS;
 using namespace std;
 
-YUVOutput::YUVOutput(const char *filename, int w, int h, uint32_t d, int csp)
+YUVOutput::YUVOutput(const char *filename, int w, int h, uint32_t d, int csp, int inputdepth)
     : width(w)
     , height(h)
     , depth(d)
     , colorSpace(csp)
     , frameSize(0)
+    , inputDepth(inputdepth)
 {
     ofs.open(filename, ios::binary | ios::out);
     buf = new charwidth;
@@ -56,50 +57,52 @@
     X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
     X265_CHECK(pic.bitDepth == (int)depth, "invalid bit depth\n");
 
-#if HIGH_BIT_DEPTH
-    if (depth == 8)
+    if (inputDepth > 8)
     {
-        int shift = pic.bitDepth - 8;
-        ofs.seekp((std::streamoff)fileOffset);
-        for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
-        {
-            uint16_t *src = (uint16_t*)pic.planesi;
-            for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
-            {
-                for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
-                    bufw = (char)(srcw >> shift);
+   if (depth == 8)
+   {
+       int shift = pic.bitDepth - 8;
+       ofs.seekp((std::streamoff)fileOffset);
+       for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
+       {
+           uint16_t *src = (uint16_t*)pic.planesi;
+           for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
+           {
+               for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
+                   bufw = (char)(srcw >> shift);
 
-                ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
-                src += pic.stridei / sizeof(*src);
-            }
-        }
+               ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
+               src += pic.stridei / sizeof(*src);
+           }
+       }
+   }
+   else
+   {
+       ofs.seekp((std::streamoff)(fileOffset * 2));
+       for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
+       {
+           uint16_t *src = (uint16_t*)pic.planesi;
+           for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
+           {
+               ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
+               src += pic.stridei / sizeof(*src);
+           }
+       }
+   }
     }
     else
     {
-        ofs.seekp((std::streamoff)(fileOffset * 2));
-        for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
-        {
-            uint16_t *src = (uint16_t*)pic.planesi;
-            for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
-            {
-                ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
-                src += pic.stridei / sizeof(*src);
-            }
-        }
+   ofs.seekp((std::streamoff)fileOffset);
+   for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
+   {
+       char *src = (char*)pic.planesi;
+       for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
+       {
+           ofs.write(src, width >> x265_cli_cspscolorSpace.widthi);
+           src += pic.stridei / sizeof(*src);
+       }
+   }
     }
-#else // if HIGH_BIT_DEPTH
-    ofs.seekp((std::streamoff)fileOffset);
-    for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
-    {
-        char *src = (char*)pic.planesi;
-        for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
-        {
-            ofs.write(src, width >> x265_cli_cspscolorSpace.widthi);
-            src += pic.stridei / sizeof(*src);
-        }
-    }
-
-#endif // if HIGH_BIT_DEPTH
 
     return true;
 }
​

x265_3.5.tar.gz/source/output/yuv.h -> x265_3.6.tar.gz/source/output/yuv.h Changed

 
@@ -46,13 +46,15 @@
 
     uint32_t frameSize;
 
+    int inputDepth;
+
     char *buf;
 
     std::ofstream ofs;
 
 public:
 
-    YUVOutput(const char *filename, int width, int height, uint32_t bitdepth, int csp);
+    YUVOutput(const char *filename, int width, int height, uint32_t bitdepth, int csp, int inputDepth);
 
     virtual ~YUVOutput();
 
​

x265_3.5.tar.gz/source/test/CMakeLists.txt -> x265_3.6.tar.gz/source/test/CMakeLists.txt Changed

 
@@ -23,15 +23,13 @@
 
 # add ARM assembly files
 if(ARM OR CROSS_COMPILE_ARM)
-    if(NOT ARM64)
-        enable_language(ASM)
-        set(NASM_SRC checkasm-arm.S)
-        add_custom_command(
-            OUTPUT checkasm-arm.obj
-            COMMAND ${CMAKE_CXX_COMPILER}
-            ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
-            DEPENDS checkasm-arm.S)
-    endif()
+    enable_language(ASM)
+    set(NASM_SRC checkasm-arm.S)
+    add_custom_command(
+        OUTPUT checkasm-arm.obj
+        COMMAND ${CMAKE_CXX_COMPILER}
+        ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
+        DEPENDS checkasm-arm.S)
 endif(ARM OR CROSS_COMPILE_ARM)
 
 # add PowerPC assembly files
​

x265_3.5.tar.gz/source/test/pixelharness.cpp -> x265_3.6.tar.gz/source/test/pixelharness.cpp Changed

@@ -406,6 +406,32 @@
     return true;
 }
 
+bool PixelHarness::check_downscaleluma_t(downscaleluma_t ref, downscaleluma_t opt)
+{
+    ALIGN_VAR_16(pixel, ref_destf32 * 32);
+    ALIGN_VAR_16(pixel, opt_destf32 * 32);
+
+    intptr_t src_stride = 64;
+    intptr_t dst_stride = 32;
+    int bx = 32;
+    int by = 32;
+    int j = 0;
+    for (int i = 0; i < ITERS; i++)
+    {
+        int index = i % TEST_CASES;
+        ref(pixel_test_buffindex + j, ref_destf, src_stride, dst_stride, bx, by);
+        checked(opt, pixel_test_buffindex + j, opt_destf, src_stride, dst_stride, bx, by);
+
+        if (memcmp(ref_destf, opt_destf, 32 * 32 * sizeof(pixel)))
+            return false;
+
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
+
 bool PixelHarness::check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt)
 {
     ALIGN_VAR_16(int16_t, ref_dest64 * 64);
@@ -2793,6 +2819,15 @@
         }
     }
 
+    if (opt.frameSubSampleLuma)
+    {
+        if (!check_downscaleluma_t(ref.frameSubSampleLuma, opt.frameSubSampleLuma))
+        {
+            printf("SubSample Luma failed!\n");
+            return false;
+        }
+    }
+
     if (opt.scale1D_128to64NONALIGNED)
     {
         if (!check_scale1D_pp(ref.scale1D_128to64NONALIGNED, opt.scale1D_128to64NONALIGNED))
@@ -3492,6 +3527,12 @@
         REPORT_SPEEDUP(opt.frameInitLowres, ref.frameInitLowres, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
     }
 
+    if (opt.frameSubSampleLuma)
+    {
+        HEADER0("downscaleluma");
+        REPORT_SPEEDUP(opt.frameSubSampleLuma, ref.frameSubSampleLuma, pbuf2, pbuf1, 64, 64, 64, 64);
+    }
+
     if (opt.scale1D_128to64NONALIGNED)
     {
         HEADER0("scale1D_128to64");

 
@@ -406,6 +406,32 @@
     return true;
 }
 
+bool PixelHarness::check_downscaleluma_t(downscaleluma_t ref, downscaleluma_t opt)
+{
+    ALIGN_VAR_16(pixel, ref_destf32 * 32);
+    ALIGN_VAR_16(pixel, opt_destf32 * 32);
+
+    intptr_t src_stride = 64;
+    intptr_t dst_stride = 32;
+    int bx = 32;
+    int by = 32;
+    int j = 0;
+    for (int i = 0; i < ITERS; i++)
+    {
+        int index = i % TEST_CASES;
+        ref(pixel_test_buffindex + j, ref_destf, src_stride, dst_stride, bx, by);
+        checked(opt, pixel_test_buffindex + j, opt_destf, src_stride, dst_stride, bx, by);
+
+        if (memcmp(ref_destf, opt_destf, 32 * 32 * sizeof(pixel)))
+            return false;
+
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
+
 bool PixelHarness::check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt)
 {
     ALIGN_VAR_16(int16_t, ref_dest64 * 64);
@@ -2793,6 +2819,15 @@
         }
     }
 
+    if (opt.frameSubSampleLuma)
+    {
+        if (!check_downscaleluma_t(ref.frameSubSampleLuma, opt.frameSubSampleLuma))
+        {
+            printf("SubSample Luma failed!\n");
+            return false;
+        }
+    }
+
     if (opt.scale1D_128to64NONALIGNED)
     {
         if (!check_scale1D_pp(ref.scale1D_128to64NONALIGNED, opt.scale1D_128to64NONALIGNED))
@@ -3492,6 +3527,12 @@
         REPORT_SPEEDUP(opt.frameInitLowres, ref.frameInitLowres, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
     }
 
+    if (opt.frameSubSampleLuma)
+    {
+        HEADER0("downscaleluma");
+        REPORT_SPEEDUP(opt.frameSubSampleLuma, ref.frameSubSampleLuma, pbuf2, pbuf1, 64, 64, 64, 64);
+    }
+
     if (opt.scale1D_128to64NONALIGNED)
     {
         HEADER0("scale1D_128to64");
​

x265_3.5.tar.gz/source/test/pixelharness.h -> x265_3.6.tar.gz/source/test/pixelharness.h Changed

 
@@ -138,6 +138,7 @@
     bool check_integral_inith(integralh_t ref, integralh_t opt);
     bool check_ssimDist(ssimDistortion_t ref, ssimDistortion_t opt);
     bool check_normFact(normFactor_t ref, normFactor_t opt, int block);
+    bool check_downscaleluma_t(downscaleluma_t ref, downscaleluma_t opt);
 
 public:
 
​

x265_3.5.tar.gz/source/test/rate-control-tests.txt -> x265_3.6.tar.gz/source/test/rate-control-tests.txt Changed

@@ -15,7 +15,7 @@
 112_1920x1080_25.yuv,--preset ultrafast --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 15000 --hrd --strict-cbr
 Traffic_4096x2048_30.yuv,--preset superfast --bitrate 20000 --vbv-maxrate 20000 --vbv-bufsize 20000 --repeat-headers --strict-cbr
 Traffic_4096x2048_30.yuv,--preset faster --bitrate 8000 --vbv-maxrate 8000 --vbv-bufsize 6000 --aud --repeat-headers --no-open-gop --hrd --pmode --pme
-News-4k.y4m,--preset veryfast --bitrate 3000 --vbv-maxrate 5000 --vbv-bufsize 5000 --repeat-headers --temporal-layers
+News-4k.y4m,--preset veryfast --bitrate 3000 --vbv-maxrate 5000 --vbv-bufsize 5000 --repeat-headers --temporal-layers 3
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --bitrate 18000 --vbv-bufsize 20000 --vbv-maxrate 18000 --strict-cbr
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --bitrate 8000 --vbv-bufsize 12000 --vbv-maxrate 10000  --tune grain
 big_buck_bunny_360p24.y4m,--preset medium --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 600 --aud --hrd --tune fast-decode

 
@@ -15,7 +15,7 @@
 112_1920x1080_25.yuv,--preset ultrafast --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 15000 --hrd --strict-cbr
 Traffic_4096x2048_30.yuv,--preset superfast --bitrate 20000 --vbv-maxrate 20000 --vbv-bufsize 20000 --repeat-headers --strict-cbr
 Traffic_4096x2048_30.yuv,--preset faster --bitrate 8000 --vbv-maxrate 8000 --vbv-bufsize 6000 --aud --repeat-headers --no-open-gop --hrd --pmode --pme
-News-4k.y4m,--preset veryfast --bitrate 3000 --vbv-maxrate 5000 --vbv-bufsize 5000 --repeat-headers --temporal-layers
+News-4k.y4m,--preset veryfast --bitrate 3000 --vbv-maxrate 5000 --vbv-bufsize 5000 --repeat-headers --temporal-layers 3
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --bitrate 18000 --vbv-bufsize 20000 --vbv-maxrate 18000 --strict-cbr
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --bitrate 8000 --vbv-bufsize 12000 --vbv-maxrate 10000  --tune grain
 big_buck_bunny_360p24.y4m,--preset medium --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 600 --aud --hrd --tune fast-decode
​

x265_3.5.tar.gz/source/test/regression-tests.txt -> x265_3.6.tar.gz/source/test/regression-tests.txt Changed

@@ -18,12 +18,12 @@
 BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190 --slices 3
 BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless --tu-inter-depth 3 --limit-tu 1
 BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao
-BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --bitrate 7000 --limit-modes::--preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --bitrate 7000 --limit-modes
+BasketballDrive_1920x1080_50.y4m,--preset medium --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --bitrate 7000 --limit-modes::--preset medium --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --bitrate 7000 --limit-modes
 BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1
 BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0 --limit-tu 4
-BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --bitrate 7000 --limit-tu 0
+BasketballDrive_1920x1080_50.y4m,--preset slower --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --bitrate 7000 --limit-tu 0
 BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3 --limit-tu 3
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2::--preset veryslow --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2::--preset veryslow --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2
 BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
 Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
 Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
@@ -33,7 +33,7 @@
 Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers 2 --tune grain
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --max-tu-size 4 --min-cu-size 32
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --no-wpp --no-cutree --no-strong-intra-smoothing --limit-refs 1
@@ -41,7 +41,7 @@
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset slower --tune ssim --tune fastdecode --limit-refs 2
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset ultrafast --weightp --no-wpp --no-open-gop
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers --limit-refs 2
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers 2 --repeat-headers --limit-refs 2
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1 --limit-modes
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut --limit-tu 1
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --aq-mode 3 --aq-strength 1.5 --aq-motion --bitrate 5000
@@ -49,11 +49,11 @@
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --hevc-aq --no-cutree --qg-size 16
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp --qg-size 16
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16 --limit-modes
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers 2 --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3 --tu-inter-depth 4 --limit-tu 3
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1::--preset fast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1::--preset fast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1
 FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
 FourPeople_1280x720_60.y4m,--preset veryfast --aq-mode 2 --aq-strength 1.5 --qg-size 8
 FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
@@ -158,13 +158,10 @@
 ducks_take_off_420_1_720p50.y4m,--preset medium --selective-sao 4 --sao --crf 20
 Traffic_4096x2048_30p.y4m, --preset medium --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
 Kimono1_1920x1080_24_400.yuv,--preset superfast --qp 28 --zones 0,139,q=32
-sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02 --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
-sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02
-sintel_trailer_2k_1920x1080_24.yuv, --preset ultrafast --hist-scenecut --hist-threshold 0.02
 crowd_run_1920x1080_50.yuv, --preset faster --ctu 32 --rskip 2 --rskip-edge-threshold 5
 crowd_run_1920x1080_50.yuv, --preset fast --ctu 64 --rskip 2 --rskip-edge-threshold 5 --aq-mode 4
-crowd_run_1920x1080_50.yuv, --preset slow --ctu 32 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1
-crowd_run_1920x1080_50.yuv, --preset slower --ctu 16 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1 --aq-mode 4
+crowd_run_1920x1080_50.yuv, --preset ultrafast --video-signal-type-preset BT2100_PQ_YCC:BT2100x108n0005
+crowd_run_1920x1080_50.yuv, --preset ultrafast --eob --eos
  
 # Main12 intraCost overflow bug test
 720p50_parkrun_ter.y4m,--preset medium
@@ -182,14 +179,22 @@
 
 #scaled save/load test
 crowd_run_1080p50.y4m,--preset ultrafast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_2160p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000 
-crowd_run_1080p50.y4m,--preset superfast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000 
-crowd_run_1080p50.y4m,--preset fast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
+crowd_run_1080p50.y4m,--preset superfast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000 
+crowd_run_1080p50.y4m,--preset fast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
 crowd_run_1080p50.y4m,--preset medium --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000  --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
-RaceHorses_416x240_30.y4m,--preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,--preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
+RaceHorses_416x240_30.y4m,--preset slow --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,--preset slow --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
 ElFunete_960x540_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-save-reuse-level 10 --analysis-save elfuente_960x540.dat --scale-factor 2::ElFunete_1920x1080_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --analysis-save elfuente_1920x1080.dat --limit-tu 0 --scale-factor 2 --analysis-load elfuente_960x540.dat --refine-intra 4 --refine-inter 2::ElFuente_3840x2160_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune=psnr --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000 --analysis-load-reuse-level 10 --limit-tu 0 --scale-factor 2 --analysis-load elfuente_1920x1080.dat 
--refine-intra 4 --refine-inter 2
 #save/load with ctu distortion refinement
 CrowdRun_1920x1080_50_10bit_422.yuv,--no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --refine-ctu-distortion 1 --bitrate 7000::--no-cutree --analysis-load x265_analysis.dat --refine-ctu-distortion 1 --bitrate 7000 --analysis-load-reuse-level 5
 #segment encoding
 BasketballDrive_1920x1080_50.y4m, --preset ultrafast --no-open-gop --chunk-start 100 --chunk-end 200
 
+#Test FG SEI message addition
+#OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune grain --film-grain "OldTownCross_1920x1080_50_10bit_422.bin"
+#RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --signhide --colormatrix bt709 --film-grain "RaceHorses_416x240_30_10bit.bin"
+
+#Temporal layers tests
+ducks_take_off_420_720p50.y4m,--preset slow --temporal-layers 3 --b-adapt 0
+parkrun_ter_720p50.y4m,--preset medium --temporal-layers 4 --b-adapt 0
+BasketballDrive_1920x1080_50.y4m, --preset medium --no-open-gop --keyint 50 --min-keyint 50 --temporal-layers 5 --b-adapt 0
 # vim: tw=200

 
@@ -18,12 +18,12 @@
 BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190 --slices 3
 BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless --tu-inter-depth 3 --limit-tu 1
 BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao
-BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --bitrate 7000 --limit-modes::--preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --bitrate 7000 --limit-modes
+BasketballDrive_1920x1080_50.y4m,--preset medium --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --bitrate 7000 --limit-modes::--preset medium --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --bitrate 7000 --limit-modes
 BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1
 BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0 --limit-tu 4
-BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --bitrate 7000 --limit-tu 0
+BasketballDrive_1920x1080_50.y4m,--preset slower --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --bitrate 7000 --limit-tu 0
 BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3 --limit-tu 3
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2::--preset veryslow --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2::--preset veryslow --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2
 BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
 Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
 Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
@@ -33,7 +33,7 @@
 Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers 2 --tune grain
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --max-tu-size 4 --min-cu-size 32
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --no-wpp --no-cutree --no-strong-intra-smoothing --limit-refs 1
@@ -41,7 +41,7 @@
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset slower --tune ssim --tune fastdecode --limit-refs 2
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset ultrafast --weightp --no-wpp --no-open-gop
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers --limit-refs 2
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers 2 --repeat-headers --limit-refs 2
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1 --limit-modes
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut --limit-tu 1
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --aq-mode 3 --aq-strength 1.5 --aq-motion --bitrate 5000
@@ -49,11 +49,11 @@
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --hevc-aq --no-cutree --qg-size 16
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp --qg-size 16
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16 --limit-modes
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers 2 --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3 --tu-inter-depth 4 --limit-tu 3
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1::--preset fast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1::--preset fast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1
 FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
 FourPeople_1280x720_60.y4m,--preset veryfast --aq-mode 2 --aq-strength 1.5 --qg-size 8
 FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
@@ -158,13 +158,10 @@
 ducks_take_off_420_1_720p50.y4m,--preset medium --selective-sao 4 --sao --crf 20
 Traffic_4096x2048_30p.y4m, --preset medium --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
 Kimono1_1920x1080_24_400.yuv,--preset superfast --qp 28 --zones 0,139,q=32
-sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02 --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
-sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02
-sintel_trailer_2k_1920x1080_24.yuv, --preset ultrafast --hist-scenecut --hist-threshold 0.02
 crowd_run_1920x1080_50.yuv, --preset faster --ctu 32 --rskip 2 --rskip-edge-threshold 5
 crowd_run_1920x1080_50.yuv, --preset fast --ctu 64 --rskip 2 --rskip-edge-threshold 5 --aq-mode 4
-crowd_run_1920x1080_50.yuv, --preset slow --ctu 32 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1
-crowd_run_1920x1080_50.yuv, --preset slower --ctu 16 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1 --aq-mode 4
+crowd_run_1920x1080_50.yuv, --preset ultrafast --video-signal-type-preset BT2100_PQ_YCC:BT2100x108n0005
+crowd_run_1920x1080_50.yuv, --preset ultrafast --eob --eos
  
 # Main12 intraCost overflow bug test
 720p50_parkrun_ter.y4m,--preset medium
@@ -182,14 +179,22 @@
 
 #scaled save/load test
 crowd_run_1080p50.y4m,--preset ultrafast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_2160p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000 
-crowd_run_1080p50.y4m,--preset superfast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000 
-crowd_run_1080p50.y4m,--preset fast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
+crowd_run_1080p50.y4m,--preset superfast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000 
+crowd_run_1080p50.y4m,--preset fast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
 crowd_run_1080p50.y4m,--preset medium --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000  --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
-RaceHorses_416x240_30.y4m,--preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,--preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
+RaceHorses_416x240_30.y4m,--preset slow --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,--preset slow --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
 ElFunete_960x540_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-save-reuse-level 10 --analysis-save elfuente_960x540.dat --scale-factor 2::ElFunete_1920x1080_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --analysis-save elfuente_1920x1080.dat --limit-tu 0 --scale-factor 2 --analysis-load elfuente_960x540.dat --refine-intra 4 --refine-inter 2::ElFuente_3840x2160_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune=psnr --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000 --analysis-load-reuse-level 10 --limit-tu 0 --scale-factor 2 --analysis-load elfuente_1920x1080.dat 
--refine-intra 4 --refine-inter 2
 #save/load with ctu distortion refinement
 CrowdRun_1920x1080_50_10bit_422.yuv,--no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --refine-ctu-distortion 1 --bitrate 7000::--no-cutree --analysis-load x265_analysis.dat --refine-ctu-distortion 1 --bitrate 7000 --analysis-load-reuse-level 5
 #segment encoding
 BasketballDrive_1920x1080_50.y4m, --preset ultrafast --no-open-gop --chunk-start 100 --chunk-end 200
 
+#Test FG SEI message addition
+#OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune grain --film-grain "OldTownCross_1920x1080_50_10bit_422.bin"
+#RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --signhide --colormatrix bt709 --film-grain "RaceHorses_416x240_30_10bit.bin"
+
+#Temporal layers tests
+ducks_take_off_420_720p50.y4m,--preset slow --temporal-layers 3 --b-adapt 0
+parkrun_ter_720p50.y4m,--preset medium --temporal-layers 4 --b-adapt 0
+BasketballDrive_1920x1080_50.y4m, --preset medium --no-open-gop --keyint 50 --min-keyint 50 --temporal-layers 5 --b-adapt 0
 # vim: tw=200
​

x265_3.5.tar.gz/source/test/save-load-tests.txt -> x265_3.6.tar.gz/source/test/save-load-tests.txt Changed

@@ -12,10 +12,10 @@
 # not auto-detected.
 crowd_run_1080p50.y4m, --preset ultrafast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_2160p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000
 crowd_run_540p50.y4m, --preset ultrafast --no-cutree --analysis-save x265_analysis.dat --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_1080p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000
-crowd_run_1080p50.y4m, --preset superfast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m,   --preset superfast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
-crowd_run_1080p50.y4m,  --preset fast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m,   --preset fast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
-crowd_run_1080p50.y4m,   --preset medium --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000  --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m,    --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m,    --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
+crowd_run_1080p50.y4m, --preset superfast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m,   --preset superfast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
+crowd_run_1080p50.y4m,  --preset fast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m,   --preset fast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
+crowd_run_1080p50.y4m,   --preset medium --analysis-save x265_analysis.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000  --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m,    --preset medium --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m,    --preset medium --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
 RaceHorses_416x240_30.y4m,   --preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m,    --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,   --preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
-crowd_run_540p50.y4m,   --preset veryslow --no-cutree --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,   --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset veryslow --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset veryslow --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
+crowd_run_540p50.y4m,   --preset veryslow --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,   --preset veryslow --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset veryslow --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset veryslow --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset veryslow --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
 crowd_run_540p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset medium --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
 News-4k.y4m,  --preset medium --analysis-save x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000::News-4k.y4m, --analysis-load x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000

 
@@ -12,10 +12,10 @@
 # not auto-detected.
 crowd_run_1080p50.y4m, --preset ultrafast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_2160p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000
 crowd_run_540p50.y4m, --preset ultrafast --no-cutree --analysis-save x265_analysis.dat --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_1080p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000
-crowd_run_1080p50.y4m, --preset superfast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m,   --preset superfast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
-crowd_run_1080p50.y4m,  --preset fast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m,   --preset fast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
-crowd_run_1080p50.y4m,   --preset medium --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000  --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m,    --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m,    --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
+crowd_run_1080p50.y4m, --preset superfast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m,   --preset superfast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
+crowd_run_1080p50.y4m,  --preset fast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m,   --preset fast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
+crowd_run_1080p50.y4m,   --preset medium --analysis-save x265_analysis.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000  --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m,    --preset medium --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m,    --preset medium --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
 RaceHorses_416x240_30.y4m,   --preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m,    --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,   --preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
-crowd_run_540p50.y4m,   --preset veryslow --no-cutree --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,   --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset veryslow --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset veryslow --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
+crowd_run_540p50.y4m,   --preset veryslow --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,   --preset veryslow --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset veryslow --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset veryslow --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset veryslow --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
 crowd_run_540p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset medium --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
 News-4k.y4m,  --preset medium --analysis-save x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000::News-4k.y4m, --analysis-load x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
​

x265_3.5.tar.gz/source/test/smoke-tests.txt -> x265_3.6.tar.gz/source/test/smoke-tests.txt Changed

 
@@ -23,3 +23,7 @@
 # Main12 intraCost overflow bug test
 720p50_parkrun_ter.y4m,--preset medium
 720p50_parkrun_ter.y4m,--preset=fast --hevc-aq --no-cutree
+# Test FG SEI message addition
+# CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --weightp --keyint -1 --film-grain "CrowdRun_1920x1080_50_10bit_444.bin"
+# DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16 --film-grain "DucksAndLegs_1920x1080_60_10bit_422.bin"
+# NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset=superfast --bitrate 10000 --sao --limit-sao --cll --max-cll "1000,400" --film-grain "NebutaFestival_2560x1600_60_10bit_crop.bin"
​

x265_3.5.tar.gz/source/test/testbench.cpp -> x265_3.6.tar.gz/source/test/testbench.cpp Changed

@@ -174,6 +174,8 @@
         { "AVX512", X265_CPU_AVX512 },
         { "ARMv6", X265_CPU_ARMV6 },
         { "NEON", X265_CPU_NEON },
+        { "SVE2", X265_CPU_SVE2 },
+        { "SVE", X265_CPU_SVE },
         { "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
         { "", 0 },
     };
@@ -208,15 +210,8 @@
 
         EncoderPrimitives asmprim;
         memset(&asmprim, 0, sizeof(asmprim));
-        setupAssemblyPrimitives(asmprim, test_archi.flag);
-
-#if X265_ARCH_ARM64
-        /* Temporary workaround because luma_vsp assembly primitive has not been completed
-         * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
-         * Otherwise, segment fault occurs. */
-        setupAliasCPrimitives(cprim, asmprim, test_archi.flag);
-#endif
 
+        setupAssemblyPrimitives(asmprim, test_archi.flag);
         setupAliasPrimitives(asmprim);
         memcpy(&primitives, &asmprim, sizeof(EncoderPrimitives));
         for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
@@ -239,14 +234,8 @@
 #if X265_ARCH_X86
     setupInstrinsicPrimitives(optprim, cpuid);
 #endif
-    setupAssemblyPrimitives(optprim, cpuid);
 
-#if X265_ARCH_ARM64
-    /* Temporary workaround because luma_vsp assembly primitive has not been completed
-     * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
-     * Otherwise, segment fault occurs. */
-    setupAliasCPrimitives(cprim, optprim, cpuid);
-#endif
+    setupAssemblyPrimitives(optprim, cpuid);
 
     /* Note that we do not setup aliases for performance tests, that would be
      * redundant. The testbench only verifies they are correctly aliased */

 
@@ -174,6 +174,8 @@
         { "AVX512", X265_CPU_AVX512 },
         { "ARMv6", X265_CPU_ARMV6 },
         { "NEON", X265_CPU_NEON },
+        { "SVE2", X265_CPU_SVE2 },
+        { "SVE", X265_CPU_SVE },
         { "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
         { "", 0 },
     };
@@ -208,15 +210,8 @@
 
         EncoderPrimitives asmprim;
         memset(&asmprim, 0, sizeof(asmprim));
-        setupAssemblyPrimitives(asmprim, test_archi.flag);
-
-#if X265_ARCH_ARM64
-        /* Temporary workaround because luma_vsp assembly primitive has not been completed
-         * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
-         * Otherwise, segment fault occurs. */
-        setupAliasCPrimitives(cprim, asmprim, test_archi.flag);
-#endif
 
+        setupAssemblyPrimitives(asmprim, test_archi.flag);
         setupAliasPrimitives(asmprim);
         memcpy(&primitives, &asmprim, sizeof(EncoderPrimitives));
         for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
@@ -239,14 +234,8 @@
 #if X265_ARCH_X86
     setupInstrinsicPrimitives(optprim, cpuid);
 #endif
-    setupAssemblyPrimitives(optprim, cpuid);
 
-#if X265_ARCH_ARM64
-    /* Temporary workaround because luma_vsp assembly primitive has not been completed
-     * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
-     * Otherwise, segment fault occurs. */
-    setupAliasCPrimitives(cprim, optprim, cpuid);
-#endif
+    setupAssemblyPrimitives(optprim, cpuid);
 
     /* Note that we do not setup aliases for performance tests, that would be
      * redundant. The testbench only verifies they are correctly aliased */
​

x265_3.5.tar.gz/source/test/testharness.h -> x265_3.6.tar.gz/source/test/testharness.h Changed

@@ -73,7 +73,7 @@
 #include <x86intrin.h>
 #elif ( !defined(__APPLE__) && defined (__GNUC__) && defined(__ARM_NEON__))
 #include <arm_neon.h>
-#elif defined(__GNUC__) && (!defined(__clang__) || __clang_major__ < 4)
+#else
 /* fallback for older GCC/MinGW */
 static inline uint32_t __rdtsc(void)
 {
@@ -82,15 +82,13 @@
 #if X265_ARCH_X86
     asm volatile("rdtsc" : "=a" (a) ::"edx");
 #elif X265_ARCH_ARM
-#if X265_ARCH_ARM64
-    asm volatile("mrs %0, cntvct_el0" : "=r"(a));
-#else
     // TOD-DO: verify following inline asm to get cpu Timestamp Counter for ARM arch
     // asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));
 
     // TO-DO: replace clock() function with appropriate ARM cpu instructions
     a = clock();
-#endif
+#elif  X265_ARCH_ARM64
+    asm volatile("mrs %0, cntvct_el0" : "=r"(a));
 #endif
     return a;
 }
@@ -128,8 +126,8 @@
         x265_emms(); \
         float optperf = (10.0f * cycles / runs) / 4; \
         float refperf = (10.0f * refcycles / refruns) / 4; \
-        printf("\t%3.2fx ", refperf / optperf); \
-        printf("\t %-8.2lf \t %-8.2lf\n", optperf, refperf); \
+        printf(" | \t%3.2fx | ", refperf / optperf); \
+        printf("\t %-8.2lf | \t %-8.2lf\n", optperf, refperf); \
     }
 
 extern "C" {
@@ -140,7 +138,7 @@
  * needs an explicit asm check because it only sometimes crashes in normal use. */
 intptr_t PFX(checkasm_call)(intptr_t (*func)(), int *ok, ...);
 float PFX(checkasm_call_float)(float (*func)(), int *ok, ...);
-#elif X265_ARCH_ARM == 0
+#elif (X265_ARCH_ARM == 0 && X265_ARCH_ARM64 == 0)
 #define PFX(stack_pagealign)(func, align) func()
 #endif

 
@@ -73,7 +73,7 @@
 #include <x86intrin.h>
 #elif ( !defined(__APPLE__) && defined (__GNUC__) && defined(__ARM_NEON__))
 #include <arm_neon.h>
-#elif defined(__GNUC__) && (!defined(__clang__) || __clang_major__ < 4)
+#else
 /* fallback for older GCC/MinGW */
 static inline uint32_t __rdtsc(void)
 {
@@ -82,15 +82,13 @@
 #if X265_ARCH_X86
     asm volatile("rdtsc" : "=a" (a) ::"edx");
 #elif X265_ARCH_ARM
-#if X265_ARCH_ARM64
-    asm volatile("mrs %0, cntvct_el0" : "=r"(a));
-#else
     // TOD-DO: verify following inline asm to get cpu Timestamp Counter for ARM arch
     // asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));
 
     // TO-DO: replace clock() function with appropriate ARM cpu instructions
     a = clock();
-#endif
+#elif  X265_ARCH_ARM64
+    asm volatile("mrs %0, cntvct_el0" : "=r"(a));
 #endif
     return a;
 }
@@ -128,8 +126,8 @@
         x265_emms(); \
         float optperf = (10.0f * cycles / runs) / 4; \
         float refperf = (10.0f * refcycles / refruns) / 4; \
-        printf("\t%3.2fx ", refperf / optperf); \
-        printf("\t %-8.2lf \t %-8.2lf\n", optperf, refperf); \
+        printf(" | \t%3.2fx | ", refperf / optperf); \
+        printf("\t %-8.2lf | \t %-8.2lf\n", optperf, refperf); \
     }
 
 extern "C" {
@@ -140,7 +138,7 @@
  * needs an explicit asm check because it only sometimes crashes in normal use. */
 intptr_t PFX(checkasm_call)(intptr_t (*func)(), int *ok, ...);
 float PFX(checkasm_call_float)(float (*func)(), int *ok, ...);
-#elif X265_ARCH_ARM == 0
+#elif (X265_ARCH_ARM == 0 && X265_ARCH_ARM64 == 0)
 #define PFX(stack_pagealign)(func, align) func()
 #endif
 
​

x265_3.5.tar.gz/source/x265.cpp -> x265_3.6.tar.gz/source/x265.cpp Changed

 
@@ -296,6 +296,16 @@
 
     int ret = 0;
 
+    if (cliopt[0].scenecutAwareQpConfig)
+    {
+        if (!cliopt[0].parseScenecutAwareQpConfig())
+        {
+            x265_log(NULL, X265_LOG_ERROR, "Unable to parse scenecut aware qp config file \n");
+            fclose(cliopt[0].scenecutAwareQpConfig);
+            cliopt[0].scenecutAwareQpConfig = NULL;
+        }
+    }
+
     AbrEncoder* abrEnc = new AbrEncoder(cliopt, numEncodes, ret);
     int threadsActive = abrEnc->m_numActiveEncodes.get();
     while (threadsActive)
​

x265_3.5.tar.gz/source/x265.h -> x265_3.6.tar.gz/source/x265.h Changed

@@ -26,6 +26,7 @@
 #define X265_H
 #include <stdint.h>
 #include <stdio.h>
+#include <sys/stat.h>
 #include "x265_config.h"
 #ifdef __cplusplus
 extern "C" {
@@ -59,7 +60,7 @@
     NAL_UNIT_CODED_SLICE_TRAIL_N = 0,
     NAL_UNIT_CODED_SLICE_TRAIL_R,
     NAL_UNIT_CODED_SLICE_TSA_N,
-    NAL_UNIT_CODED_SLICE_TLA_R,
+    NAL_UNIT_CODED_SLICE_TSA_R,
     NAL_UNIT_CODED_SLICE_STSA_N,
     NAL_UNIT_CODED_SLICE_STSA_R,
     NAL_UNIT_CODED_SLICE_RADL_N,
@@ -311,6 +312,7 @@
     double           vmafFrameScore;
     double           bufferFillFinal;
     double           unclippedBufferFillFinal;
+    uint8_t          tLayer;
 } x265_frame_stats;
 
 typedef struct x265_ctu_info_t
@@ -536,6 +538,8 @@
 /* ARM */
 #define X265_CPU_ARMV6           0x0000001
 #define X265_CPU_NEON            0x0000002  /* ARM NEON */
+#define X265_CPU_SVE2            0x0000008  /* ARM SVE2 */
+#define X265_CPU_SVE             0x0000010  /* ARM SVE */
 #define X265_CPU_FAST_NEON_MRC   0x0000004  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
 
 /* IBM Power8 */
@@ -613,6 +617,13 @@
 #define SLICE_TYPE_DELTA        0.3 /* The offset decremented or incremented for P-frames or b-frames respectively*/
 #define BACKWARD_WINDOW         1 /* Scenecut window before a scenecut */
 #define FORWARD_WINDOW          2 /* Scenecut window after a scenecut */
+#define BWD_WINDOW_DELTA        0.4
+
+#define X265_MAX_GOP_CONFIG 3
+#define X265_MAX_GOP_LENGTH 16
+#define MAX_T_LAYERS 7
+
+#define X265_IPRATIO_STRENGTH   1.43
 
 typedef struct x265_cli_csp
 {
@@ -696,6 +707,7 @@
 typedef struct x265_zone
 {
     int   startFrame, endFrame; /* range of frame numbers */
+    int   keyframeMax;          /* it store the default/user defined keyframeMax value*/
     int   bForceQp;             /* whether to use qp vs bitrate factor */
     int   qp;
     float bitrateFactor;
@@ -747,6 +759,271 @@
 
 static const x265_vmaf_commondata vcd = { { NULL, (char *)"/usr/local/share/model/vmaf_v0.6.1.pkl", NULL, NULL, 0, 0, 0, 0, 0, 0, 0, NULL, 0, 1, 0 } };
 
+typedef struct x265_temporal_layer {
+    int poc_offset;      /* POC offset */
+    int8_t layer;        /* Current layer */
+    int8_t qp_offset;    /* QP offset */
+} x265_temporal_layer;
+
+static const int8_t x265_temporal_layer_bframes[MAX_T_LAYERS] = {-1, -1, 3, 7, 15, -1, -1};
+
+static const int8_t x265_gop_ra_length[X265_MAX_GOP_CONFIG] = { 4, 8, 16};
+static const x265_temporal_layer x265_gop_ra[X265_MAX_GOP_CONFIG][X265_MAX_GOP_LENGTH] = {
+    {
+        {
+            4,
+            0,
+            1,
+        },
+        {
+            2,
+            1,
+            5,
+        },
+        {
+            1,
+            2,
+            3,
+        },
+        {
+            3,
+            2,
+            5,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        }
+    },
+
+    {
+        {
+            8,
+            0,
+            1,
+        },
+        {
+            4,
+            1,
+            5,
+        },
+        {
+            2,
+            2,
+            4,
+        },
+        {
+            1,
+            3,
+            5,
+        },
+        {
+            3,
+            3,
+            2,
+        },
+        {
+            6,
+            2,
+            5,
+        },
+        {
+            5,
+            3,
+            4,
+        },
+        {
+            7,
+            3,
+            5,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {

 
@@ -26,6 +26,7 @@
 #define X265_H
 #include <stdint.h>
 #include <stdio.h>
+#include <sys/stat.h>
 #include "x265_config.h"
 #ifdef __cplusplus
 extern "C" {
@@ -59,7 +60,7 @@
     NAL_UNIT_CODED_SLICE_TRAIL_N = 0,
     NAL_UNIT_CODED_SLICE_TRAIL_R,
     NAL_UNIT_CODED_SLICE_TSA_N,
-    NAL_UNIT_CODED_SLICE_TLA_R,
+    NAL_UNIT_CODED_SLICE_TSA_R,
     NAL_UNIT_CODED_SLICE_STSA_N,
     NAL_UNIT_CODED_SLICE_STSA_R,
     NAL_UNIT_CODED_SLICE_RADL_N,
@@ -311,6 +312,7 @@
     double           vmafFrameScore;
     double           bufferFillFinal;
     double           unclippedBufferFillFinal;
+    uint8_t          tLayer;
 } x265_frame_stats;
 
 typedef struct x265_ctu_info_t
@@ -536,6 +538,8 @@
 /* ARM */
 #define X265_CPU_ARMV6           0x0000001
 #define X265_CPU_NEON            0x0000002  /* ARM NEON */
+#define X265_CPU_SVE2            0x0000008  /* ARM SVE2 */
+#define X265_CPU_SVE             0x0000010  /* ARM SVE */
 #define X265_CPU_FAST_NEON_MRC   0x0000004  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
 
 /* IBM Power8 */
@@ -613,6 +617,13 @@
 #define SLICE_TYPE_DELTA        0.3 /* The offset decremented or incremented for P-frames or b-frames respectively*/
 #define BACKWARD_WINDOW         1 /* Scenecut window before a scenecut */
 #define FORWARD_WINDOW          2 /* Scenecut window after a scenecut */
+#define BWD_WINDOW_DELTA        0.4
+
+#define X265_MAX_GOP_CONFIG 3
+#define X265_MAX_GOP_LENGTH 16
+#define MAX_T_LAYERS 7
+
+#define X265_IPRATIO_STRENGTH   1.43
 
 typedef struct x265_cli_csp
 {
@@ -696,6 +707,7 @@
 typedef struct x265_zone
 {
     int   startFrame, endFrame; /* range of frame numbers */
+    int   keyframeMax;          /* it store the default/user defined keyframeMax value*/
     int   bForceQp;             /* whether to use qp vs bitrate factor */
     int   qp;
     float bitrateFactor;
@@ -747,6 +759,271 @@
 
 static const x265_vmaf_commondata vcd = { { NULL, (char *)"/usr/local/share/model/vmaf_v0.6.1.pkl", NULL, NULL, 0, 0, 0, 0, 0, 0, 0, NULL, 0, 1, 0 } };
 
+typedef struct x265_temporal_layer {
+    int poc_offset;      /* POC offset */
+    int8_t layer;        /* Current layer */
+    int8_t qp_offset;    /* QP offset */
+} x265_temporal_layer;
+
+static const int8_t x265_temporal_layer_bframes[MAX_T_LAYERS] = {-1, -1, 3, 7, 15, -1, -1};
+
+static const int8_t x265_gop_ra_length[X265_MAX_GOP_CONFIG] = { 4, 8, 16};
+static const x265_temporal_layer x265_gop_ra[X265_MAX_GOP_CONFIG][X265_MAX_GOP_LENGTH] = {
+    {
+        {
+            4,
+            0,
+            1,
+        },
+        {
+            2,
+            1,
+            5,
+        },
+        {
+            1,
+            2,
+            3,
+        },
+        {
+            3,
+            2,
+            5,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        }
+    },
+
+    {
+        {
+            8,
+            0,
+            1,
+        },
+        {
+            4,
+            1,
+            5,
+        },
+        {
+            2,
+            2,
+            4,
+        },
+        {
+            1,
+            3,
+            5,
+        },
+        {
+            3,
+            3,
+            2,
+        },
+        {
+            6,
+            2,
+            5,
+        },
+        {
+            5,
+            3,
+            4,
+        },
+        {
+            7,
+            3,
+            5,
+        },
+        {
+            -1,
+            -1,
+            -1,
+        },
+        {
​

x265_3.5.tar.gz/source/x265cli.cpp -> x265_3.6.tar.gz/source/x265cli.cpp Changed

@@ -28,8 +28,8 @@
 #include "x265cli.h"
 #include "svt.h"
 
-#define START_CODE 0x00000001
-#define START_CODE_BYTES 4
+#define START_CODE 0x00000001
+#define START_CODE_BYTES 4
 
 #ifdef __cplusplus
 namespace X265_NS {
@@ -166,6 +166,7 @@
         H0("   --rdpenalty <0..2>            penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum. Default %d\n", param->rdPenalty);
         H0("\nSlice decision options:\n");
         H0("   --no-open-gop               Enable open-GOP, allows I slices to be non-IDR. Default %s\n", OPT(param->bOpenGOP));
+		H0("   --cra-nal                     Force nal type to CRA to all frames expect first frame, works only with keyint 1. Default %s\n", OPT(param->craNal));
         H0("-I/--keyint <integer>            Max IDR period in frames. -1 for infinite-gop. Default %d\n", param->keyframeMax);
         H0("-i/--min-keyint <integer>        Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
         H0("   --gop-lookahead <integer>     Extends gop boundary if a scenecut is found within this from keyint boundary. Default 0\n");
@@ -174,7 +175,6 @@
         H1("   --scenecut-bias <0..100.0>    Bias for scenecut detection. Default %.2f\n", param->scenecutBias);
         H0("   --hist-scenecut               Enables histogram based scene-cut detection using histogram based algorithm.\n");
         H0("   --no-hist-scenecut            Disables histogram based scene-cut detection using histogram based algorithm.\n");
-        H1("   --hist-threshold <0.0..1.0>   Luma Edge histogram's Normalized SAD threshold for histogram based scenecut detection Default %.2f\n", param->edgeTransitionThreshold);
         H0("   --no-fades                  Enable detection and handling of fade-in regions. Default %s\n", OPT(param->bEnableFades));
         H1("   --scenecut-aware-qp <0..3>    Enable increasing QP for frames inside the scenecut window around scenecut. Default %s\n", OPT(param->bEnableSceneCutAwareQp));
         H1("                                 0 - Disabled\n");
@@ -182,6 +182,7 @@
         H1("                                 2 - Backward masking\n");
         H1("                                 3 - Bidirectional masking\n");
         H1("   --masking-strength <string>   Comma separated values which specify the duration and offset for the QP increment for inter-frames when scenecut-aware-qp is enabled.\n");
+        H1("   --scenecut-qp-config <file>   File containing scenecut-aware-qp mode, window duration and offsets settings required for the masking. Works only with --pass 2\n");
         H0("   --radl <integer>              Number of RADL pictures allowed in front of IDR. Default %d\n", param->radl);
         H0("   --intra-refresh               Use Periodic Intra Refresh instead of IDR frames\n");
         H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
@@ -262,6 +263,7 @@
         H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
         H0("   --qp-adaptation-range <float> Delta QP range by QP adaptation based on a psycho-visual model (1.0 to 6.0). Default %.2f\n", param->rc.qpAdaptationRange);
         H0("   --no-aq-motion              Block level QP adaptation based on the relative motion between the block and the frame. Default %s\n", OPT(param->bAQMotion));
+        H1("   --no-sbrc                   Enables the segment based rate control. Default %s\n", OPT(param->bEnableSBRC));
         H0("   --qg-size <int>               Specifies the size of the quantization group (64, 32, 16, 8). Default %d\n", param->rc.qgSize);
         H0("   --no-cutree                 Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
         H0("   --no-rc-grain               Enable ratecontrol mode to handle grains specifically. turned on with tune grain. Default %s\n", OPT(param->rc.bEnableGrain));
@@ -282,6 +284,7 @@
         H1("                                       q=<integer> (force QP)\n");
         H1("                                   or  b=<float> (bitrate multiplier)\n");
         H0("   --zonefile <filename>         Zone file containing the zone boundaries and the parameters to be reconfigured.\n");
+        H0("   --no-zonefile-rc-init         This allow to use rate-control history across zones in zonefile.\n");
         H1("   --lambda-file <string>        Specify a file containing replacement values for the lambda tables\n");
         H1("                                 MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
         H1("                                 Blank lines and lines starting with hash(#) are ignored\n");
@@ -314,6 +317,30 @@
         H0("   --master-display <string>     SMPTE ST 2086 master display color volume info SEI (HDR)\n");
         H0("                                    format: G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min)\n");
         H0("   --max-cll <string>            Specify content light level info SEI as \"cll,fall\" (HDR).\n");
+        H0("   --video-signal-type-preset <string>    Specify combinations of color primaries, transfer characteristics, color matrix, range of luma and chroma signals, and chroma sample location\n");
+        H0("                                            format: <system-id>:<color-volume>\n");
+        H0("                                            This has higher precedence than individual VUI parameters. If any individual VUI option is specified together with this,\n");
+        H0("                                            which changes the values set corresponding to the system-id or color-volume, it will be discarded.\n");
+        H0("                                            The color-volume can be used only with the system-id options BT2100_PQ_YCC, BT2100_PQ_ICTCP, and BT2100_PQ_RGB.\n");
+        H0("                                            system-id options and their corresponding values:\n");
+        H0("                                              BT601_525:       --colorprim smpte170m --transfer smpte170m --colormatrix smpte170m --range limited --chromaloc 0\n");
+        H0("                                              BT601_626:       --colorprim bt470bg --transfer smpte170m --colormatrix bt470bg --range limited --chromaloc 0\n");
+        H0("                                              BT709_YCC:       --colorprim bt709 --transfer bt709 --colormatrix bt709 --range limited --chromaloc 0\n");
+        H0("                                              BT709_RGB:       --colorprim bt709 --transfer bt709 --colormatrix gbr --range limited\n");
+        H0("                                              BT2020_YCC_NCL:  --colorprim bt2020 --transfer bt2020-10 --colormatrix bt709 --range limited --chromaloc 2\n");
+        H0("                                              BT2020_RGB:      --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc --range limited\n");
+        H0("                                              BT2100_PQ_YCC:   --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc --range limited --chromaloc 2\n");
+        H0("                                              BT2100_PQ_ICTCP: --colorprim bt2020 --transfer smpte2084 --colormatrix ictcp --range limited --chromaloc 2\n");
+        H0("                                              BT2100_PQ_RGB:   --colorprim bt2020 --transfer smpte2084 --colormatrix gbr --range limited\n");
+        H0("                                              BT2100_HLG_YCC:  --colorprim bt2020 --transfer arib-std-b67 --colormatrix bt2020nc --range limited --chromaloc 2\n");
+        H0("                                              BT2100_HLG_RGB:  --colorprim bt2020 --transfer arib-std-b67 --colormatrix gbr --range limited\n");
+        H0("                                              FR709_RGB:       --colorprim bt709 --transfer bt709 --colormatrix gbr --range full\n");
+        H0("                                              FR2020_RGB:      --colorprim bt2020 --transfer bt2020-10 --colormatrix gbr --range full\n");
+        H0("                                              FRP3D65_YCC:     --colorprim smpte432 --transfer bt709 --colormatrix smpte170m --range full --chromaloc 1\n");
+        H0("                                            color-volume options and their corresponding values:\n");
+        H0("                                              P3D65x1000n0005: --master-display G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(10000000,5)\n");
+        H0("                                              P3D65x4000n005:  --master-display G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(40000000,50)\n");
+        H0("                                              BT2100x108n0005: --master-display G(8500,39850)B(6550,2300)R(34000,146000)WP(15635,16450)L(10000000,1)\n");
         H0("   --no-cll                    Emit content light level info SEI. Default %s\n", OPT(param->bEmitCLL));
         H0("   --no-hdr10                  Control dumping of HDR10 SEI packet. If max-cll or master-display has non-zero values, this is enabled. Default %s\n", OPT(param->bEmitHDR10SEI));
         H0("   --no-hdr-opt                Add luma and chroma offsets for HDR/WCG content. Default %s. Now deprecated.\n", OPT(param->bHDROpt));
@@ -324,9 +351,11 @@
         H0("   --no-repeat-headers         Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
         H0("   --no-info                   Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
         H0("   --no-hrd                    Enable HRD parameters signaling. Default %s\n", OPT(param->bEmitHRDSEI));
-        H0("   --no-idr-recovery-sei      Emit recovery point infor SEI at each IDR frame \n");
-        H0("   --no-temporal-layers        Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
+        H0("   --no-idr-recovery-sei       Emit recovery point infor SEI at each IDR frame \n");
+        H0("   --temporal-layers             Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
         H0("   --no-aud                    Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
+        H0("   --no-eob                    Emit end of bitstream nal unit at the end of the bitstream. Default %s\n", OPT(param->bEnableEndOfBitstream));
+        H0("   --no-eos                    Emit end of sequence nal unit at the end of every coded video sequence. Default %s\n", OPT(param->bEnableEndOfSequence));
         H1("   --hash <integer>              Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
         H0("   --atc-sei <integer>           Emit the alternative transfer characteristics SEI message where the integer is the preferred transfer characteristics. Default disabled\n");
         H0("   --pic-struct <integer>        Set the picture structure and emits it in the picture timing SEI message. Values in the range 0..12. See D.3.3 of the HEVC spec. for a detailed explanation.\n");
@@ -344,6 +373,7 @@
         H0("   --lowpass-dct                 Use low-pass subband dct approximation. Default %s\n", OPT(param->bLowPassDct));
         H0("   --no-frame-dup              Enable Frame duplication. Default %s\n", OPT(param->bEnableFrameDuplication));
         H0("   --dup-threshold <integer>     PSNR threshold for Frame duplication. Default %d\n", param->dupThreshold);
+        H0("   --no-mcstf                  Enable GOP based temporal filter. Default %d\n", param->bEnableTemporalFilter);
 #ifdef SVT_HEVC
         H0("   --nosvt                     Enable SVT HEVC encoder %s\n", OPT(param->bEnableSvtHevc));
         H0("   --no-svt-hme                Enable Hierarchial motion estimation(HME) in SVT HEVC encoder \n");
@@ -365,6 +395,9 @@
         H1("    2 - unable to open encoder\n");
         H1("    3 - unable to generate stream headers\n");
         H1("    4 - encoder abort\n");
+        H0("\nSEI Message Options\n");
+        H0("   --film-grain <filename>           File containing Film Grain Characteristics to be written as a SEI Message\n");
+
 #undef OPT
 #undef H0
 #undef H1
@@ -484,6 +517,9 @@
 
         memcpy(globalParam->rc.zones[zonefileCount].zoneParam, globalParam, sizeof(x265_param));
 
+        if (zonefileCount == 0)
+            globalParam->rc.zones[zonefileCount].keyframeMax = globalParam->keyframeMax;
+
         for (optind = 0;;)
         {
             int long_options_index = -1;
@@ -708,12 +744,19 @@
                         return true;
                     }
                 }
+                OPT("scenecut-qp-config")
+                {
+                    this->scenecutAwareQpConfig = x265_fopen(optarg, "rb");
+                    if (!this->scenecutAwareQpConfig)
+                        x265_log_file(param, X265_LOG_ERROR, "%s scenecut aware qp config file not found or error in opening config file\n", optarg);
+                }
                 OPT("zonefile")
                 {
                     this->zoneFile = x265_fopen(optarg, "rb");
                     if (!this->zoneFile)
                         x265_log_file(param, X265_LOG_ERROR, "%s zone file not found or error in opening zone file\n", optarg);
                 }
+                OPT("no-zonefile-rc-init") this->param->bNoResetZoneConfig = true;
                 OPT("fullhelp")
                 {
                     param->logLevel = X265_LOG_FULL;
@@ -875,7 +918,7 @@
             if (reconFileBitDepth == 0)
                 reconFileBitDepth = param->internalBitDepth;
             this->recon = ReconFile::open(reconfn, param->sourceWidth, param->sourceHeight, reconFileBitDepth,
-                param->fpsNum, param->fpsDenom, param->internalCsp);
+                param->fpsNum, param->fpsDenom, param->internalCsp, param->sourceBitDepth);
             if (this->recon->isFail())
             {
                 x265_log(param, X265_LOG_WARNING, "unable to write reconstructed outputs file\n");
@@ -973,6 +1016,7 @@
         param->rc.zones = X265_MALLOC(x265_zone, param->rc.zonefileCount);
         for (int i = 0; i < param->rc.zonefileCount; i++)
         {
+            param->rc.zones[i].startFrame = -1;
             while (fgets(line, sizeof(line), zoneFile))
             {
                 if (*line == '#' || (strcmp(line, "\r\n") == 0))
@@ -1010,57 +1054,179 @@
         return 1;
     }
 
-    /* Parse the RPU file and extract the RPU corresponding to the current picture
-    * and fill the rpu field of the input picture */
-    int CLIOptions::rpuParser(x265_picture * pic)
-    {
-        uint8_t byteVal;
-        uint32_t code = 0;
-        int bytesRead = 0;
-        pic->rpu.payloadSize = 0;
-
-        if (!pic->pts)
-        {
-            while (bytesRead++ < 4 && fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
-                code = (code << 8) | byteVal;
-
-            if (code != START_CODE)
-            {
-                x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU startcode in POC %d\n", pic->pts);
-                return 1;
-            }
-        }
-
-        bytesRead = 0;
-        while (fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
-        {
-            code = (code << 8) | byteVal;
-            if (bytesRead++ < 3)
-                continue;
-            if (bytesRead >= 1024)
-            {
-                x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU size in POC %d\n", pic->pts);
-                return 1;
-            }
-
-            if (code != START_CODE)
-                pic->rpu.payload[pic->rpu.payloadSize++] = (code >> (3 * 8)) & 0xFF;

 
@@ -28,8 +28,8 @@
 #include "x265cli.h"
 #include "svt.h"
 
-#define START_CODE 0x00000001
-#define START_CODE_BYTES 4
+#define START_CODE 0x00000001
+#define START_CODE_BYTES 4
 
 #ifdef __cplusplus
 namespace X265_NS {
@@ -166,6 +166,7 @@
         H0("   --rdpenalty <0..2>            penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum. Default %d\n", param->rdPenalty);
         H0("\nSlice decision options:\n");
         H0("   --no-open-gop               Enable open-GOP, allows I slices to be non-IDR. Default %s\n", OPT(param->bOpenGOP));
+       H0("   --cra-nal                     Force nal type to CRA to all frames expect first frame, works only with keyint 1. Default %s\n", OPT(param->craNal));
         H0("-I/--keyint <integer>            Max IDR period in frames. -1 for infinite-gop. Default %d\n", param->keyframeMax);
         H0("-i/--min-keyint <integer>        Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
         H0("   --gop-lookahead <integer>     Extends gop boundary if a scenecut is found within this from keyint boundary. Default 0\n");
@@ -174,7 +175,6 @@
         H1("   --scenecut-bias <0..100.0>    Bias for scenecut detection. Default %.2f\n", param->scenecutBias);
         H0("   --hist-scenecut               Enables histogram based scene-cut detection using histogram based algorithm.\n");
         H0("   --no-hist-scenecut            Disables histogram based scene-cut detection using histogram based algorithm.\n");
-        H1("   --hist-threshold <0.0..1.0>   Luma Edge histogram's Normalized SAD threshold for histogram based scenecut detection Default %.2f\n", param->edgeTransitionThreshold);
         H0("   --no-fades                  Enable detection and handling of fade-in regions. Default %s\n", OPT(param->bEnableFades));
         H1("   --scenecut-aware-qp <0..3>    Enable increasing QP for frames inside the scenecut window around scenecut. Default %s\n", OPT(param->bEnableSceneCutAwareQp));
         H1("                                 0 - Disabled\n");
@@ -182,6 +182,7 @@
         H1("                                 2 - Backward masking\n");
         H1("                                 3 - Bidirectional masking\n");
         H1("   --masking-strength <string>   Comma separated values which specify the duration and offset for the QP increment for inter-frames when scenecut-aware-qp is enabled.\n");
+        H1("   --scenecut-qp-config <file>   File containing scenecut-aware-qp mode, window duration and offsets settings required for the masking. Works only with --pass 2\n");
         H0("   --radl <integer>              Number of RADL pictures allowed in front of IDR. Default %d\n", param->radl);
         H0("   --intra-refresh               Use Periodic Intra Refresh instead of IDR frames\n");
         H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
@@ -262,6 +263,7 @@
         H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
         H0("   --qp-adaptation-range <float> Delta QP range by QP adaptation based on a psycho-visual model (1.0 to 6.0). Default %.2f\n", param->rc.qpAdaptationRange);
         H0("   --no-aq-motion              Block level QP adaptation based on the relative motion between the block and the frame. Default %s\n", OPT(param->bAQMotion));
+        H1("   --no-sbrc                   Enables the segment based rate control. Default %s\n", OPT(param->bEnableSBRC));
         H0("   --qg-size <int>               Specifies the size of the quantization group (64, 32, 16, 8). Default %d\n", param->rc.qgSize);
         H0("   --no-cutree                 Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
         H0("   --no-rc-grain               Enable ratecontrol mode to handle grains specifically. turned on with tune grain. Default %s\n", OPT(param->rc.bEnableGrain));
@@ -282,6 +284,7 @@
         H1("                                       q=<integer> (force QP)\n");
         H1("                                   or  b=<float> (bitrate multiplier)\n");
         H0("   --zonefile <filename>         Zone file containing the zone boundaries and the parameters to be reconfigured.\n");
+        H0("   --no-zonefile-rc-init         This allow to use rate-control history across zones in zonefile.\n");
         H1("   --lambda-file <string>        Specify a file containing replacement values for the lambda tables\n");
         H1("                                 MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
         H1("                                 Blank lines and lines starting with hash(#) are ignored\n");
@@ -314,6 +317,30 @@
         H0("   --master-display <string>     SMPTE ST 2086 master display color volume info SEI (HDR)\n");
         H0("                                    format: G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min)\n");
         H0("   --max-cll <string>            Specify content light level info SEI as \"cll,fall\" (HDR).\n");
+        H0("   --video-signal-type-preset <string>    Specify combinations of color primaries, transfer characteristics, color matrix, range of luma and chroma signals, and chroma sample location\n");
+        H0("                                            format: <system-id>:<color-volume>\n");
+        H0("                                            This has higher precedence than individual VUI parameters. If any individual VUI option is specified together with this,\n");
+        H0("                                            which changes the values set corresponding to the system-id or color-volume, it will be discarded.\n");
+        H0("                                            The color-volume can be used only with the system-id options BT2100_PQ_YCC, BT2100_PQ_ICTCP, and BT2100_PQ_RGB.\n");
+        H0("                                            system-id options and their corresponding values:\n");
+        H0("                                              BT601_525:       --colorprim smpte170m --transfer smpte170m --colormatrix smpte170m --range limited --chromaloc 0\n");
+        H0("                                              BT601_626:       --colorprim bt470bg --transfer smpte170m --colormatrix bt470bg --range limited --chromaloc 0\n");
+        H0("                                              BT709_YCC:       --colorprim bt709 --transfer bt709 --colormatrix bt709 --range limited --chromaloc 0\n");
+        H0("                                              BT709_RGB:       --colorprim bt709 --transfer bt709 --colormatrix gbr --range limited\n");
+        H0("                                              BT2020_YCC_NCL:  --colorprim bt2020 --transfer bt2020-10 --colormatrix bt709 --range limited --chromaloc 2\n");
+        H0("                                              BT2020_RGB:      --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc --range limited\n");
+        H0("                                              BT2100_PQ_YCC:   --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc --range limited --chromaloc 2\n");
+        H0("                                              BT2100_PQ_ICTCP: --colorprim bt2020 --transfer smpte2084 --colormatrix ictcp --range limited --chromaloc 2\n");
+        H0("                                              BT2100_PQ_RGB:   --colorprim bt2020 --transfer smpte2084 --colormatrix gbr --range limited\n");
+        H0("                                              BT2100_HLG_YCC:  --colorprim bt2020 --transfer arib-std-b67 --colormatrix bt2020nc --range limited --chromaloc 2\n");
+        H0("                                              BT2100_HLG_RGB:  --colorprim bt2020 --transfer arib-std-b67 --colormatrix gbr --range limited\n");
+        H0("                                              FR709_RGB:       --colorprim bt709 --transfer bt709 --colormatrix gbr --range full\n");
+        H0("                                              FR2020_RGB:      --colorprim bt2020 --transfer bt2020-10 --colormatrix gbr --range full\n");
+        H0("                                              FRP3D65_YCC:     --colorprim smpte432 --transfer bt709 --colormatrix smpte170m --range full --chromaloc 1\n");
+        H0("                                            color-volume options and their corresponding values:\n");
+        H0("                                              P3D65x1000n0005: --master-display G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(10000000,5)\n");
+        H0("                                              P3D65x4000n005:  --master-display G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(40000000,50)\n");
+        H0("                                              BT2100x108n0005: --master-display G(8500,39850)B(6550,2300)R(34000,146000)WP(15635,16450)L(10000000,1)\n");
         H0("   --no-cll                    Emit content light level info SEI. Default %s\n", OPT(param->bEmitCLL));
         H0("   --no-hdr10                  Control dumping of HDR10 SEI packet. If max-cll or master-display has non-zero values, this is enabled. Default %s\n", OPT(param->bEmitHDR10SEI));
         H0("   --no-hdr-opt                Add luma and chroma offsets for HDR/WCG content. Default %s. Now deprecated.\n", OPT(param->bHDROpt));
@@ -324,9 +351,11 @@
         H0("   --no-repeat-headers         Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
         H0("   --no-info                   Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
         H0("   --no-hrd                    Enable HRD parameters signaling. Default %s\n", OPT(param->bEmitHRDSEI));
-        H0("   --no-idr-recovery-sei      Emit recovery point infor SEI at each IDR frame \n");
-        H0("   --no-temporal-layers        Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
+        H0("   --no-idr-recovery-sei       Emit recovery point infor SEI at each IDR frame \n");
+        H0("   --temporal-layers             Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
         H0("   --no-aud                    Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
+        H0("   --no-eob                    Emit end of bitstream nal unit at the end of the bitstream. Default %s\n", OPT(param->bEnableEndOfBitstream));
+        H0("   --no-eos                    Emit end of sequence nal unit at the end of every coded video sequence. Default %s\n", OPT(param->bEnableEndOfSequence));
         H1("   --hash <integer>              Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
         H0("   --atc-sei <integer>           Emit the alternative transfer characteristics SEI message where the integer is the preferred transfer characteristics. Default disabled\n");
         H0("   --pic-struct <integer>        Set the picture structure and emits it in the picture timing SEI message. Values in the range 0..12. See D.3.3 of the HEVC spec. for a detailed explanation.\n");
@@ -344,6 +373,7 @@
         H0("   --lowpass-dct                 Use low-pass subband dct approximation. Default %s\n", OPT(param->bLowPassDct));
         H0("   --no-frame-dup              Enable Frame duplication. Default %s\n", OPT(param->bEnableFrameDuplication));
         H0("   --dup-threshold <integer>     PSNR threshold for Frame duplication. Default %d\n", param->dupThreshold);
+        H0("   --no-mcstf                  Enable GOP based temporal filter. Default %d\n", param->bEnableTemporalFilter);
 #ifdef SVT_HEVC
         H0("   --nosvt                     Enable SVT HEVC encoder %s\n", OPT(param->bEnableSvtHevc));
         H0("   --no-svt-hme                Enable Hierarchial motion estimation(HME) in SVT HEVC encoder \n");
@@ -365,6 +395,9 @@
         H1("    2 - unable to open encoder\n");
         H1("    3 - unable to generate stream headers\n");
         H1("    4 - encoder abort\n");
+        H0("\nSEI Message Options\n");
+        H0("   --film-grain <filename>           File containing Film Grain Characteristics to be written as a SEI Message\n");
+
 #undef OPT
 #undef H0
 #undef H1
@@ -484,6 +517,9 @@
 
         memcpy(globalParam->rc.zones[zonefileCount].zoneParam, globalParam, sizeof(x265_param));
 
+        if (zonefileCount == 0)
+            globalParam->rc.zones[zonefileCount].keyframeMax = globalParam->keyframeMax;
+
         for (optind = 0;;)
         {
             int long_options_index = -1;
@@ -708,12 +744,19 @@
                         return true;
                     }
                 }
+                OPT("scenecut-qp-config")
+                {
+                    this->scenecutAwareQpConfig = x265_fopen(optarg, "rb");
+                    if (!this->scenecutAwareQpConfig)
+                        x265_log_file(param, X265_LOG_ERROR, "%s scenecut aware qp config file not found or error in opening config file\n", optarg);
+                }
                 OPT("zonefile")
                 {
                     this->zoneFile = x265_fopen(optarg, "rb");
                     if (!this->zoneFile)
                         x265_log_file(param, X265_LOG_ERROR, "%s zone file not found or error in opening zone file\n", optarg);
                 }
+                OPT("no-zonefile-rc-init") this->param->bNoResetZoneConfig = true;
                 OPT("fullhelp")
                 {
                     param->logLevel = X265_LOG_FULL;
@@ -875,7 +918,7 @@
             if (reconFileBitDepth == 0)
                 reconFileBitDepth = param->internalBitDepth;
             this->recon = ReconFile::open(reconfn, param->sourceWidth, param->sourceHeight, reconFileBitDepth,
-                param->fpsNum, param->fpsDenom, param->internalCsp);
+                param->fpsNum, param->fpsDenom, param->internalCsp, param->sourceBitDepth);
             if (this->recon->isFail())
             {
                 x265_log(param, X265_LOG_WARNING, "unable to write reconstructed outputs file\n");
@@ -973,6 +1016,7 @@
         param->rc.zones = X265_MALLOC(x265_zone, param->rc.zonefileCount);
         for (int i = 0; i < param->rc.zonefileCount; i++)
         {
+            param->rc.zones[i].startFrame = -1;
             while (fgets(line, sizeof(line), zoneFile))
             {
                 if (*line == '#' || (strcmp(line, "\r\n") == 0))
@@ -1010,57 +1054,179 @@
         return 1;
     }
 
-    /* Parse the RPU file and extract the RPU corresponding to the current picture
-    * and fill the rpu field of the input picture */
-    int CLIOptions::rpuParser(x265_picture * pic)
-    {
-        uint8_t byteVal;
-        uint32_t code = 0;
-        int bytesRead = 0;
-        pic->rpu.payloadSize = 0;
-
-        if (!pic->pts)
-        {
-            while (bytesRead++ < 4 && fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
-                code = (code << 8) | byteVal;
-
-            if (code != START_CODE)
-            {
-                x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU startcode in POC %d\n", pic->pts);
-                return 1;
-            }
-        }
-
-        bytesRead = 0;
-        while (fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
-        {
-            code = (code << 8) | byteVal;
-            if (bytesRead++ < 3)
-                continue;
-            if (bytesRead >= 1024)
-            {
-                x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU size in POC %d\n", pic->pts);
-                return 1;
-            }
-
-            if (code != START_CODE)
-                pic->rpu.payload[pic->rpu.payloadSize++] = (code >> (3 * 8)) & 0xFF;
​

x265_3.5.tar.gz/source/x265cli.h -> x265_3.6.tar.gz/source/x265cli.h Changed

@@ -135,6 +135,7 @@
     { "no-fast-intra",        no_argument, NULL, 0 },
     { "no-open-gop",          no_argument, NULL, 0 },
     { "open-gop",             no_argument, NULL, 0 },
+    { "cra-nal",              no_argument, NULL, 0 },
     { "keyint",         required_argument, NULL, 'I' },
     { "min-keyint",     required_argument, NULL, 'i' },
     { "gop-lookahead",  required_argument, NULL, 0 },
@@ -143,7 +144,6 @@
     { "scenecut-bias",  required_argument, NULL, 0 },
     { "hist-scenecut",        no_argument, NULL, 0},
     { "no-hist-scenecut",     no_argument, NULL, 0},
-    { "hist-threshold", required_argument, NULL, 0},
     { "fades",                no_argument, NULL, 0 },
     { "no-fades",             no_argument, NULL, 0 },
     { "scenecut-aware-qp", required_argument, NULL, 0 },
@@ -182,6 +182,8 @@
     { "qp",             required_argument, NULL, 'q' },
     { "aq-mode",        required_argument, NULL, 0 },
     { "aq-strength",    required_argument, NULL, 0 },
+    { "sbrc",                 no_argument, NULL, 0 },
+    { "no-sbrc",              no_argument, NULL, 0 },
     { "rc-grain",             no_argument, NULL, 0 },
     { "no-rc-grain",          no_argument, NULL, 0 },
     { "ipratio",        required_argument, NULL, 0 },
@@ -244,6 +246,7 @@
     { "crop-rect",      required_argument, NULL, 0 }, /* DEPRECATED */
     { "master-display", required_argument, NULL, 0 },
     { "max-cll",        required_argument, NULL, 0 },
+    {"video-signal-type-preset", required_argument, NULL, 0 },
     { "min-luma",       required_argument, NULL, 0 },
     { "max-luma",       required_argument, NULL, 0 },
     { "log2-max-poc-lsb", required_argument, NULL, 8 },
@@ -263,11 +266,16 @@
     { "repeat-headers",       no_argument, NULL, 0 },
     { "aud",                  no_argument, NULL, 0 },
     { "no-aud",               no_argument, NULL, 0 },
+    { "eob",                  no_argument, NULL, 0 },
+    { "no-eob",               no_argument, NULL, 0 },
+    { "eos",                  no_argument, NULL, 0 },
+    { "no-eos",               no_argument, NULL, 0 },
     { "info",                 no_argument, NULL, 0 },
     { "no-info",              no_argument, NULL, 0 },
     { "zones",          required_argument, NULL, 0 },
     { "qpfile",         required_argument, NULL, 0 },
     { "zonefile",       required_argument, NULL, 0 },
+    { "no-zonefile-rc-init",  no_argument, NULL, 0 },
     { "lambda-file",    required_argument, NULL, 0 },
     { "b-intra",              no_argument, NULL, 0 },
     { "no-b-intra",           no_argument, NULL, 0 },
@@ -298,8 +306,7 @@
     { "dynamic-refine",       no_argument, NULL, 0 },
     { "no-dynamic-refine",    no_argument, NULL, 0 },
     { "strict-cbr",           no_argument, NULL, 0 },
-    { "temporal-layers",      no_argument, NULL, 0 },
-    { "no-temporal-layers",   no_argument, NULL, 0 },
+    { "temporal-layers",      required_argument, NULL, 0 },
     { "qg-size",        required_argument, NULL, 0 },
     { "recon-y4m-exec", required_argument, NULL, 0 },
     { "analyze-src-pics", no_argument, NULL, 0 },
@@ -349,6 +356,8 @@
     { "frame-dup",            no_argument, NULL, 0 },
     { "no-frame-dup", no_argument, NULL, 0 },
     { "dup-threshold", required_argument, NULL, 0 },
+    { "mcstf",                 no_argument, NULL, 0 },
+    { "no-mcstf",              no_argument, NULL, 0 },
 #ifdef SVT_HEVC
     { "svt",     no_argument, NULL, 0 },
     { "no-svt",  no_argument, NULL, 0 },
@@ -373,6 +382,8 @@
     { "abr-ladder", required_argument, NULL, 0 },
     { "min-vbv-fullness", required_argument, NULL, 0 },
     { "max-vbv-fullness", required_argument, NULL, 0 },
+    { "scenecut-qp-config", required_argument, NULL, 0 },
+    { "film-grain", required_argument, NULL, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
@@ -388,6 +399,7 @@
         FILE*       qpfile;
         FILE*       zoneFile;
         FILE*    dolbyVisionRpu;    /* File containing Dolby Vision BL RPU metadata */
+        FILE*    scenecutAwareQpConfig; /* File containing scenecut aware frame quantization related CLI options */
         const char* reconPlayCmd;
         const x265_api* api;
         x265_param* param;
@@ -425,6 +437,7 @@
             qpfile = NULL;
             zoneFile = NULL;
             dolbyVisionRpu = NULL;
+            scenecutAwareQpConfig = NULL;
             reconPlayCmd = NULL;
             api = NULL;
             param = NULL;
@@ -455,6 +468,8 @@
         bool parseQPFile(x265_picture &pic_org);
         bool parseZoneFile();
         int rpuParser(x265_picture * pic);
+        bool parseScenecutAwareQpConfig();
+        bool parseScenecutAwareQpParam(int argc, char **argv, x265_param* globalParam);
     };
 #ifdef __cplusplus
 }

 
@@ -135,6 +135,7 @@
     { "no-fast-intra",        no_argument, NULL, 0 },
     { "no-open-gop",          no_argument, NULL, 0 },
     { "open-gop",             no_argument, NULL, 0 },
+    { "cra-nal",              no_argument, NULL, 0 },
     { "keyint",         required_argument, NULL, 'I' },
     { "min-keyint",     required_argument, NULL, 'i' },
     { "gop-lookahead",  required_argument, NULL, 0 },
@@ -143,7 +144,6 @@
     { "scenecut-bias",  required_argument, NULL, 0 },
     { "hist-scenecut",        no_argument, NULL, 0},
     { "no-hist-scenecut",     no_argument, NULL, 0},
-    { "hist-threshold", required_argument, NULL, 0},
     { "fades",                no_argument, NULL, 0 },
     { "no-fades",             no_argument, NULL, 0 },
     { "scenecut-aware-qp", required_argument, NULL, 0 },
@@ -182,6 +182,8 @@
     { "qp",             required_argument, NULL, 'q' },
     { "aq-mode",        required_argument, NULL, 0 },
     { "aq-strength",    required_argument, NULL, 0 },
+    { "sbrc",                 no_argument, NULL, 0 },
+    { "no-sbrc",              no_argument, NULL, 0 },
     { "rc-grain",             no_argument, NULL, 0 },
     { "no-rc-grain",          no_argument, NULL, 0 },
     { "ipratio",        required_argument, NULL, 0 },
@@ -244,6 +246,7 @@
     { "crop-rect",      required_argument, NULL, 0 }, /* DEPRECATED */
     { "master-display", required_argument, NULL, 0 },
     { "max-cll",        required_argument, NULL, 0 },
+    {"video-signal-type-preset", required_argument, NULL, 0 },
     { "min-luma",       required_argument, NULL, 0 },
     { "max-luma",       required_argument, NULL, 0 },
     { "log2-max-poc-lsb", required_argument, NULL, 8 },
@@ -263,11 +266,16 @@
     { "repeat-headers",       no_argument, NULL, 0 },
     { "aud",                  no_argument, NULL, 0 },
     { "no-aud",               no_argument, NULL, 0 },
+    { "eob",                  no_argument, NULL, 0 },
+    { "no-eob",               no_argument, NULL, 0 },
+    { "eos",                  no_argument, NULL, 0 },
+    { "no-eos",               no_argument, NULL, 0 },
     { "info",                 no_argument, NULL, 0 },
     { "no-info",              no_argument, NULL, 0 },
     { "zones",          required_argument, NULL, 0 },
     { "qpfile",         required_argument, NULL, 0 },
     { "zonefile",       required_argument, NULL, 0 },
+    { "no-zonefile-rc-init",  no_argument, NULL, 0 },
     { "lambda-file",    required_argument, NULL, 0 },
     { "b-intra",              no_argument, NULL, 0 },
     { "no-b-intra",           no_argument, NULL, 0 },
@@ -298,8 +306,7 @@
     { "dynamic-refine",       no_argument, NULL, 0 },
     { "no-dynamic-refine",    no_argument, NULL, 0 },
     { "strict-cbr",           no_argument, NULL, 0 },
-    { "temporal-layers",      no_argument, NULL, 0 },
-    { "no-temporal-layers",   no_argument, NULL, 0 },
+    { "temporal-layers",      required_argument, NULL, 0 },
     { "qg-size",        required_argument, NULL, 0 },
     { "recon-y4m-exec", required_argument, NULL, 0 },
     { "analyze-src-pics", no_argument, NULL, 0 },
@@ -349,6 +356,8 @@
     { "frame-dup",            no_argument, NULL, 0 },
     { "no-frame-dup", no_argument, NULL, 0 },
     { "dup-threshold", required_argument, NULL, 0 },
+    { "mcstf",                 no_argument, NULL, 0 },
+    { "no-mcstf",              no_argument, NULL, 0 },
 #ifdef SVT_HEVC
     { "svt",     no_argument, NULL, 0 },
     { "no-svt",  no_argument, NULL, 0 },
@@ -373,6 +382,8 @@
     { "abr-ladder", required_argument, NULL, 0 },
     { "min-vbv-fullness", required_argument, NULL, 0 },
     { "max-vbv-fullness", required_argument, NULL, 0 },
+    { "scenecut-qp-config", required_argument, NULL, 0 },
+    { "film-grain", required_argument, NULL, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
@@ -388,6 +399,7 @@
         FILE*       qpfile;
         FILE*       zoneFile;
         FILE*    dolbyVisionRpu;    /* File containing Dolby Vision BL RPU metadata */
+        FILE*    scenecutAwareQpConfig; /* File containing scenecut aware frame quantization related CLI options */
         const char* reconPlayCmd;
         const x265_api* api;
         x265_param* param;
@@ -425,6 +437,7 @@
             qpfile = NULL;
             zoneFile = NULL;
             dolbyVisionRpu = NULL;
+            scenecutAwareQpConfig = NULL;
             reconPlayCmd = NULL;
             api = NULL;
             param = NULL;
@@ -455,6 +468,8 @@
         bool parseQPFile(x265_picture &pic_org);
         bool parseZoneFile();
         int rpuParser(x265_picture * pic);
+        bool parseScenecutAwareQpConfig();
+        bool parseScenecutAwareQpParam(int argc, char **argv, x265_param* globalParam);
     };
 #ifdef __cplusplus
 }
​

x265_3.5.tar.gz/x265Version.txt -> x265_3.6.tar.gz/x265Version.txt Changed

 
@@ -1,4 +1,4 @@
 #Attribute:         Values
-repositorychangeset: f0c1022b6
+repositorychangeset: aa7f602f7
 releasetagdistance: 1
-releasetag: 3.5
+releasetag: 3.6
​