Packman Build Service PMBS

We truncated the diff of some files because they were too big. If you want to see the full diff for every file, click here.

Changes of Revision 39

x265.changes Changed

@@ -1,4 +1,40 @@
 -------------------------------------------------------------------
+Mon Jun  1 17:51:22 UTC 2020 - Luigi Baldoni <aloisio@gmx.com>
+
+- Update to version 3.4
+  New features:
+  * Edge-aware quadtree partitioning to terminate CU depth
+    recursion based on edge information. --rskip level 2 enables
+    the feature and --rskip-edge-threshold denotes the minimum
+    expected edge-density percentage within the CU, below which
+    the recursion is skipped. Experimental feature.
+  * Application-level feature --abr-ladder for automating
+    efficient ABR ladder generation. Shows ~65% savings in the
+    over-all turn-around time required for the generation of a
+    typical Apple HLS ladder in Intel(R) Xeon(R) Platinum 8280
+    CPU @ 2.70GHz over a sequential ABR-ladder generation
+    approach that leverages save-load architecture.
+  Enhancements to existing features:
+  * Improved efficiency in 2-pass rate-control algorithm. The
+    savings in the bitrate is ~1.72% with visual improvement in
+    quality in the initial 1-2 secs.
+  Encoder enhancements:
+  * Faster ARM64 encodes enabled by ASM contributions from
+    Huawei. The speed-up over no-asm version for 1080p encodes @
+    medium preset is ~15% in a 16 core H/W.
+  * Strict VBV conformance in zone encoding.
+  Bug fixes:
+  * Multi-pass encode failures with --frame-dup.
+  * Corrupted bitstreams with --hist-scenecut when input depth
+    and internal bit-depth differ.
+  * Incorrect analysis propagation in multi-level save-load
+    architecture.
+  * Failure in detecting NUMA packages installed in non-standard
+    directories.
+
+- Refreshed arm.patch
+
+-------------------------------------------------------------------
 Sat Mar 28 14:28:56 UTC 2020 - Luigi Baldoni <aloisio@gmx.com>
 
 - Update to version 3.3

​x
 
@@ -1,4 +1,40 @@
 -------------------------------------------------------------------
+Mon Jun  1 17:51:22 UTC 2020 - Luigi Baldoni <aloisio@gmx.com>
+
+- Update to version 3.4
+  New features:
+  * Edge-aware quadtree partitioning to terminate CU depth
+    recursion based on edge information. --rskip level 2 enables
+    the feature and --rskip-edge-threshold denotes the minimum
+    expected edge-density percentage within the CU, below which
+    the recursion is skipped. Experimental feature.
+  * Application-level feature --abr-ladder for automating
+    efficient ABR ladder generation. Shows ~65% savings in the
+    over-all turn-around time required for the generation of a
+    typical Apple HLS ladder in Intel(R) Xeon(R) Platinum 8280
+    CPU @ 2.70GHz over a sequential ABR-ladder generation
+    approach that leverages save-load architecture.
+  Enhancements to existing features:
+  * Improved efficiency in 2-pass rate-control algorithm. The
+    savings in the bitrate is ~1.72% with visual improvement in
+    quality in the initial 1-2 secs.
+  Encoder enhancements:
+  * Faster ARM64 encodes enabled by ASM contributions from
+    Huawei. The speed-up over no-asm version for 1080p encodes @
+    medium preset is ~15% in a 16 core H/W.
+  * Strict VBV conformance in zone encoding.
+  Bug fixes:
+  * Multi-pass encode failures with --frame-dup.
+  * Corrupted bitstreams with --hist-scenecut when input depth
+    and internal bit-depth differ.
+  * Incorrect analysis propagation in multi-level save-load
+    architecture.
+  * Failure in detecting NUMA packages installed in non-standard
+    directories.
+
+- Refreshed arm.patch
+
+-------------------------------------------------------------------
 Sat Mar 28 14:28:56 UTC 2020 - Luigi Baldoni <aloisio@gmx.com>
 
 - Update to version 3.3
​

x265.spec Changed

 
@@ -17,11 +17,11 @@
 #
 
 
-%define sover  188
+%define sover  192
 %define libname lib%{name}
 %define libsoname %{libname}-%{sover}
 Name:           x265
-Version:        3.3
+Version:        3.4
 Release:        0
 Summary:        A free h265/HEVC encoder - encoder binary
 License:        GPL-2.0-or-later
@@ -67,7 +67,6 @@
 %patch0 -p1
 %patch1 -p1
 %patch2 -p1
-
 sed -i -e "s/0.0/%{sover}.0/g" source/cmake/version.cmake
 
 
​

arm.patch Changed

@@ -1,8 +1,8 @@
-Index: x265_2.2/source/CMakeLists.txt
+Index: x265_3.4/source/CMakeLists.txt
 ===================================================================
---- x265_2.2.orig/source/CMakeLists.txt
-+++ x265_2.2/source/CMakeLists.txt
-@@ -65,15 +65,22 @@ elseif(POWERMATCH GREATER "-1")
+--- x265_3.4.orig/source/CMakeLists.txt
++++ x265_3.4/source/CMakeLists.txt
+@@ -64,26 +64,26 @@ elseif(POWERMATCH GREATER "-1")
          add_definitions(-DPPC64=1)
          message(STATUS "Detected POWER PPC64 target processor")
      endif()
@@ -12,41 +12,62 @@
 -    else()
 -        set(CROSS_COMPILE_ARM 0)
 -    endif()
--    message(STATUS "Detected ARM target processor")
 -    set(ARM 1)
--    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
+-    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
+-        message(STATUS "Detected ARM64 target processor")
+-        set(ARM64 1)
+-        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
+-    else()
+-        message(STATUS "Detected ARM target processor")
+-        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
+-    endif()
 +elseif(${SYSPROC} MATCHES "armv5.*")
 +    message(STATUS "Detected ARMV5 system processor")
 +    set(ARMV5 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
++    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
 +elseif(${SYSPROC} STREQUAL "armv6l")
 +    message(STATUS "Detected ARMV6 system processor")
 +    set(ARMV6 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
++    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
 +elseif(${SYSPROC} STREQUAL "armv7l")
 +    message(STATUS "Detected ARMV7 system processor")
 +    set(ARMV7 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
++    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
 +elseif(${SYSPROC} STREQUAL "aarch64")
 +    message(STATUS "Detected AArch64 system processor")
 +    set(ARMV7 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
++    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
  else()
      message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
      message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
-@@ -208,18 +215,9 @@ if(GCC)
+ endif()
+-
+ if(UNIX)
+     list(APPEND PLATFORM_LIBS pthread)
+     find_library(LIBRT rt)
+@@ -238,28 +238,9 @@ if(GCC)
              endif()
          endif()
      endif()
 -    if(ARM AND CROSS_COMPILE_ARM)
--        set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
+-        if(ARM64)
+-            set(ARM_ARGS -fPIC)
+-        else()
+-            set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
+-        endif()
+-        message(STATUS "cross compile arm")
 -    elseif(ARM)
--        find_package(Neon)
--        if(CPU_HAS_NEON)
--            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
+-        if(ARM64)
+-            set(ARM_ARGS -fPIC)
 -            add_definitions(-DHAVE_NEON)
 -        else()
--            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
+-            find_package(Neon)
+-            if(CPU_HAS_NEON)
+-                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
+-                add_definitions(-DHAVE_NEON)
+-            else()
+-                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
+-            endif()
 -        endif()
 +    if(ARMV7)
 +        add_definitions(-fPIC)
@@ -55,11 +76,11 @@
      if(FPROFILE_GENERATE)
          if(INTEL_CXX)
              add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
-Index: x265_2.2/source/common/cpu.cpp
+Index: x265_3.4/source/common/cpu.cpp
 ===================================================================
---- x265_2.2.orig/source/common/cpu.cpp
-+++ x265_2.2/source/common/cpu.cpp
-@@ -37,7 +37,7 @@
+--- x265_3.4.orig/source/common/cpu.cpp
++++ x265_3.4/source/common/cpu.cpp
+@@ -39,7 +39,7 @@
  #include <machine/cpu.h>
  #endif
  
@@ -68,7 +89,7 @@
  #include <signal.h>
  #include <setjmp.h>
  static sigjmp_buf jmpbuf;
-@@ -344,7 +344,6 @@ uint32_t cpu_detect(void)
+@@ -350,7 +350,6 @@ uint32_t cpu_detect(bool benableavx512)
      }
  
      canjump = 1;
@@ -76,7 +97,7 @@
      canjump = 0;
      signal(SIGILL, oldsig);
  #endif // if !HAVE_NEON
-@@ -360,7 +359,7 @@ uint32_t cpu_detect(void)
+@@ -366,7 +365,7 @@ uint32_t cpu_detect(bool benableavx512)
      // which may result in incorrect detection and the counters stuck enabled.
      // right now Apple does not seem to support performance counters for this test
  #ifndef __MACH__
@@ -84,4 +105,4 @@
 +    //flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
  #endif
      // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
- #endif // if HAVE_ARMV6
+ #elif X265_ARCH_ARM64

 
@@ -1,8 +1,8 @@
-Index: x265_2.2/source/CMakeLists.txt
+Index: x265_3.4/source/CMakeLists.txt
 ===================================================================
---- x265_2.2.orig/source/CMakeLists.txt
-+++ x265_2.2/source/CMakeLists.txt
-@@ -65,15 +65,22 @@ elseif(POWERMATCH GREATER "-1")
+--- x265_3.4.orig/source/CMakeLists.txt
++++ x265_3.4/source/CMakeLists.txt
+@@ -64,26 +64,26 @@ elseif(POWERMATCH GREATER "-1")
          add_definitions(-DPPC64=1)
          message(STATUS "Detected POWER PPC64 target processor")
      endif()
@@ -12,41 +12,62 @@
 -    else()
 -        set(CROSS_COMPILE_ARM 0)
 -    endif()
--    message(STATUS "Detected ARM target processor")
 -    set(ARM 1)
--    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
+-    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
+-        message(STATUS "Detected ARM64 target processor")
+-        set(ARM64 1)
+-        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
+-    else()
+-        message(STATUS "Detected ARM target processor")
+-        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
+-    endif()
 +elseif(${SYSPROC} MATCHES "armv5.*")
 +    message(STATUS "Detected ARMV5 system processor")
 +    set(ARMV5 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
++    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
 +elseif(${SYSPROC} STREQUAL "armv6l")
 +    message(STATUS "Detected ARMV6 system processor")
 +    set(ARMV6 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
++    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
 +elseif(${SYSPROC} STREQUAL "armv7l")
 +    message(STATUS "Detected ARMV7 system processor")
 +    set(ARMV7 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
++    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
 +elseif(${SYSPROC} STREQUAL "aarch64")
 +    message(STATUS "Detected AArch64 system processor")
 +    set(ARMV7 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
++    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
  else()
      message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
      message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
-@@ -208,18 +215,9 @@ if(GCC)
+ endif()
+-
+ if(UNIX)
+     list(APPEND PLATFORM_LIBS pthread)
+     find_library(LIBRT rt)
+@@ -238,28 +238,9 @@ if(GCC)
              endif()
          endif()
      endif()
 -    if(ARM AND CROSS_COMPILE_ARM)
--        set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
+-        if(ARM64)
+-            set(ARM_ARGS -fPIC)
+-        else()
+-            set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
+-        endif()
+-        message(STATUS "cross compile arm")
 -    elseif(ARM)
--        find_package(Neon)
--        if(CPU_HAS_NEON)
--            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
+-        if(ARM64)
+-            set(ARM_ARGS -fPIC)
 -            add_definitions(-DHAVE_NEON)
 -        else()
--            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
+-            find_package(Neon)
+-            if(CPU_HAS_NEON)
+-                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
+-                add_definitions(-DHAVE_NEON)
+-            else()
+-                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
+-            endif()
 -        endif()
 +    if(ARMV7)
 +        add_definitions(-fPIC)
@@ -55,11 +76,11 @@
      if(FPROFILE_GENERATE)
          if(INTEL_CXX)
              add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
-Index: x265_2.2/source/common/cpu.cpp
+Index: x265_3.4/source/common/cpu.cpp
 ===================================================================
---- x265_2.2.orig/source/common/cpu.cpp
-+++ x265_2.2/source/common/cpu.cpp
-@@ -37,7 +37,7 @@
+--- x265_3.4.orig/source/common/cpu.cpp
++++ x265_3.4/source/common/cpu.cpp
+@@ -39,7 +39,7 @@
  #include <machine/cpu.h>
  #endif
  
@@ -68,7 +89,7 @@
  #include <signal.h>
  #include <setjmp.h>
  static sigjmp_buf jmpbuf;
-@@ -344,7 +344,6 @@ uint32_t cpu_detect(void)
+@@ -350,7 +350,6 @@ uint32_t cpu_detect(bool benableavx512)
      }
  
      canjump = 1;
@@ -76,7 +97,7 @@
      canjump = 0;
      signal(SIGILL, oldsig);
  #endif // if !HAVE_NEON
-@@ -360,7 +359,7 @@ uint32_t cpu_detect(void)
+@@ -366,7 +365,7 @@ uint32_t cpu_detect(bool benableavx512)
      // which may result in incorrect detection and the counters stuck enabled.
      // right now Apple does not seem to support performance counters for this test
  #ifndef __MACH__
@@ -84,4 +105,4 @@
 +    //flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
  #endif
      // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
- #endif // if HAVE_ARMV6
+ #elif X265_ARCH_ARM64
​

baselibs.conf Changed

 
@@ -1,1 +1,1 @@
-libx265-179
+libx265-192
​

x265_3.3.tar.gz/.hg_archival.txt -> x265_3.4.tar.gz/.hg_archival.txt Changed

 
@@ -1,5 +1,4 @@
 repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf
-node: f94b0d32737d40b2b9a9d74df57fee45e6be5cb0
-branch: Release_3.3
-latesttag: 3.3
-latesttagdistance: 1
+node: 2a65b720985096bcb1664f7cb05c3d04aeb576f5
+branch: Release_3.4
+tag: 3.4
​

x265_3.3.tar.gz/.hgtags -> x265_3.4.tar.gz/.hgtags Changed

 
@@ -40,3 +40,4 @@
 5ee3593ebd82b4d8957909bbc1b68b99b59ba773 3.3_RC1
 96a10df63c0b778b480330bdf3be8da7db8a5fb1 3.3_RC2
 057215961bc4b51b6260a584ff3d506e6d65cfd6 3.3
+ee92f36782800f145970131e01c79955a3ed5c10 3.4_RC1
​

x265_3.4.tar.gz/build/aarch64-linux/crosscompile.cmake Added

 
@@ -0,0 +1,15 @@
+# CMake toolchain file for cross compiling x265 for aarch64
+# This feature is only supported as experimental. Use with caution.
+# Please report bugs on bitbucket
+# Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source
+
+set(CROSS_COMPILE_ARM 1)
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
+
+# specify the cross compiler
+set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
+set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
+
+# specify the target environment
+SET(CMAKE_FIND_ROOT_PATH  /usr/aarch64-linux-gnu)
​

x265_3.4.tar.gz/build/aarch64-linux/make-Makefiles.bash Added

 
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Run this from within a bash shell
+
+cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source && ccmake ../../source
​

x265_3.3.tar.gz/doc/reST/cli.rst -> x265_3.4.tar.gz/doc/reST/cli.rst Changed

@@ -107,6 +107,9 @@
 	
 	**BufferFillFinal** Buffer bits available after removing the frame out of CPB.
 	
+	**UnclippedBufferFillFinal** Unclipped buffer bits available after removing the frame 
+	out of CPB only used for csv logging purpose.
+	
 	**Latency** Latency in terms of number of frames between when the frame 
 	was given in and when the frame is given out.
 	
@@ -842,15 +845,31 @@
 	Measure 2Nx2N merge candidates first; if no residual is found, 
 	additional modes at that depth are not analysed. Default disabled
 
-.. option:: --rskip, --no-rskip
+.. option:: --rskip <0|1|2>
+
+	This option determines early exit from CU depth recursion in modes 1 and 2. When a skip CU is
+	found, additional heuristics (depending on the RD level and rskip mode) are used to decide whether
+	to terminate recursion. The following table summarizes the behavior.
+	
+	+----------+------------+----------------------------------------------------------------+
+	| RD Level | Rskip Mode |   Skip Recursion Heuristic                                     |
+	+==========+============+================================================================+
+	|   0 - 4  |      1     |   Neighbour costs and CU homogeneity.                          |
+	+----------+------------+----------------------------------------------------------------+
+	|   5 - 6  |      1     |   Comparison with inter2Nx2N.                                  |
+	+----------+------------+----------------------------------------------------------------+
+	|   0 - 6  |      2     |   CU edge density.                                             |
+	+----------+------------+----------------------------------------------------------------+
+
+	Provides minimal quality degradation at good performance gains for non-zero modes.
+	:option:`--rskip mode 0` means disabled. Default: 1, disabled when :option:`--tune grain` is used.
+	This is an integer value representing the edge-density percentage within the CU. Internally normalized to a number between 0.0 and 1.0 in x265. 
+	Recommended low thresholds for slow encodes and high for fast encodes.
 
-	This option determines early exit from CU depth recursion. When a skip CU is
-	found, additional heuristics (depending on rd-level) are used to decide whether
-	to terminate recursion. In rdlevels 5 and 6, comparison with inter2Nx2N is used, 
-	while at rdlevels 4 and neighbour costs are used to skip recursion.
-	Provides minimal quality degradation at good performance gains when enabled. 
+.. option:: --rskip-edge-threshold <0..100>
 
-	Default: enabled, disabled for :option:`--tune grain`
+	Denotes the minimum expected edge-density percentage within the CU, below which the recursion is skipped.
+	Default: 5, requires :option:`--rskip mode 2` to be enabled.
 
 .. option:: --splitrd-skip, --no-splitrd-skip
 
@@ -2501,6 +2520,28 @@
 	--recon-y4m-exec "ffplay -i pipe:0 -autoexit"
 
 	**CLI ONLY**
+	
+ABR-ladder Options
+==================
+
+.. option:: --abr-ladder <filename>
+
+	File containing the encoder configurations to generate ABR ladder.
+	The format of each line is:
+
+	**<encID:reuse-level:refID> <CLI>**
+	
+	where, encID indicates the unique name given to the encode, refID indicates
+	the name of the encode from which analysis info has to be re-used ( set to 'nil'
+	if analysis reuse isn't preferred ), and reuse-level indicates the level ( :option:`--analysis-load-reuse-level`)
+	at which analysis info has to be reused.
+	
+	A sample config file is available in `the downloads page <https://bitbucket.org/multicoreware/x265/downloads/Sample_ABR_ladder_config>`_
+	
+	Default: Disabled ( Conventional single encode generation ). Experimental feature.
+
+	**CLI ONLY**
+
 
 SVT-HEVC Encoder Options
 ========================

 
@@ -107,6 +107,9 @@
    
    **BufferFillFinal** Buffer bits available after removing the frame out of CPB.
    
+   **UnclippedBufferFillFinal** Unclipped buffer bits available after removing the frame 
+   out of CPB only used for csv logging purpose.
+   
    **Latency** Latency in terms of number of frames between when the frame 
    was given in and when the frame is given out.
    
@@ -842,15 +845,31 @@
    Measure 2Nx2N merge candidates first; if no residual is found, 
    additional modes at that depth are not analysed. Default disabled
 
-.. option:: --rskip, --no-rskip
+.. option:: --rskip <0|1|2>
+
+   This option determines early exit from CU depth recursion in modes 1 and 2. When a skip CU is
+   found, additional heuristics (depending on the RD level and rskip mode) are used to decide whether
+   to terminate recursion. The following table summarizes the behavior.
+   
+   +----------+------------+----------------------------------------------------------------+
+   | RD Level | Rskip Mode |   Skip Recursion Heuristic                                     |
+   +==========+============+================================================================+
+   |   0 - 4  |      1     |   Neighbour costs and CU homogeneity.                          |
+   +----------+------------+----------------------------------------------------------------+
+   |   5 - 6  |      1     |   Comparison with inter2Nx2N.                                  |
+   +----------+------------+----------------------------------------------------------------+
+   |   0 - 6  |      2     |   CU edge density.                                             |
+   +----------+------------+----------------------------------------------------------------+
+
+   Provides minimal quality degradation at good performance gains for non-zero modes.
+   :option:`--rskip mode 0` means disabled. Default: 1, disabled when :option:`--tune grain` is used.
+   This is an integer value representing the edge-density percentage within the CU. Internally normalized to a number between 0.0 and 1.0 in x265. 
+   Recommended low thresholds for slow encodes and high for fast encodes.
 
-   This option determines early exit from CU depth recursion. When a skip CU is
-   found, additional heuristics (depending on rd-level) are used to decide whether
-   to terminate recursion. In rdlevels 5 and 6, comparison with inter2Nx2N is used, 
-   while at rdlevels 4 and neighbour costs are used to skip recursion.
-   Provides minimal quality degradation at good performance gains when enabled. 
+.. option:: --rskip-edge-threshold <0..100>
 
-   Default: enabled, disabled for :option:`--tune grain`
+   Denotes the minimum expected edge-density percentage within the CU, below which the recursion is skipped.
+   Default: 5, requires :option:`--rskip mode 2` to be enabled.
 
 .. option:: --splitrd-skip, --no-splitrd-skip
 
@@ -2501,6 +2520,28 @@
    --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
 
    **CLI ONLY**
+   
+ABR-ladder Options
+==================
+
+.. option:: --abr-ladder <filename>
+
+   File containing the encoder configurations to generate ABR ladder.
+   The format of each line is:
+
+   **<encID:reuse-level:refID> <CLI>**
+   
+   where, encID indicates the unique name given to the encode, refID indicates
+   the name of the encode from which analysis info has to be re-used ( set to 'nil'
+   if analysis reuse isn't preferred ), and reuse-level indicates the level ( :option:`--analysis-load-reuse-level`)
+   at which analysis info has to be reused.
+   
+   A sample config file is available in `the downloads page <https://bitbucket.org/multicoreware/x265/downloads/Sample_ABR_ladder_config>`_
+   
+   Default: Disabled ( Conventional single encode generation ). Experimental feature.
+
+   **CLI ONLY**
+
 
 SVT-HEVC Encoder Options
 ========================
​

x265_3.3.tar.gz/doc/reST/releasenotes.rst -> x265_3.4.tar.gz/doc/reST/releasenotes.rst Changed

@@ -2,6 +2,32 @@
 Release Notes
 *************
 
+Version 3.4
+===========
+
+Release date - 29th May, 2020.
+
+New features
+------------
+1. **Edge-aware quadtree partitioning** to terminate CU depth recursion based on edge information. :option:`--rskip` level 2 enables the feature and  :option:`--rskip-edge-threshold` denotes the minimum expected edge-density percentage within the CU, below which the recursion is skipped. Experimental feature.
+2. Application-level feature :option:`--abr-ladder` for automating efficient ABR ladder generation. Shows ~65% savings in the over-all turn-around time required for the generation of a typical Apple HLS ladder in Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz over a sequential ABR-ladder generation approach that leverages save-load architecture.
+
+Enhancements to existing features
+---------------------------------
+1. Improved efficiency in 2-pass rate-control algorithm. The savings in the bitrate is ~1.72% with visual improvement in quality in the initial 1-2 secs.
+
+Encoder enhancements
+--------------------
+1. Faster ARM64 encodes enabled by ASM contributions from Huawei. The speed-up over no-asm version for 1080p encodes @ medium preset is ~15% in a 16 core H/W.
+2. Strict VBV conformance in zone encoding.
+
+Bug fixes
+---------
+1. Multi-pass encode failures with :option:`--frame-dup`.
+2. Corrupted bitstreams with :option:`--hist-scenecut` when input depth and internal bit-depth differ.
+3. Incorrect analysis propagation in multi-level save-load architecture.
+4. Failure in detecting NUMA packages installed in non-standard directories.
+
 Version 3.3
 ===========

 
@@ -2,6 +2,32 @@
 Release Notes
 *************
 
+Version 3.4
+===========
+
+Release date - 29th May, 2020.
+
+New features
+------------
+1. **Edge-aware quadtree partitioning** to terminate CU depth recursion based on edge information. :option:`--rskip` level 2 enables the feature and  :option:`--rskip-edge-threshold` denotes the minimum expected edge-density percentage within the CU, below which the recursion is skipped. Experimental feature.
+2. Application-level feature :option:`--abr-ladder` for automating efficient ABR ladder generation. Shows ~65% savings in the over-all turn-around time required for the generation of a typical Apple HLS ladder in Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz over a sequential ABR-ladder generation approach that leverages save-load architecture.
+
+Enhancements to existing features
+---------------------------------
+1. Improved efficiency in 2-pass rate-control algorithm. The savings in the bitrate is ~1.72% with visual improvement in quality in the initial 1-2 secs.
+
+Encoder enhancements
+--------------------
+1. Faster ARM64 encodes enabled by ASM contributions from Huawei. The speed-up over no-asm version for 1080p encodes @ medium preset is ~15% in a 16 core H/W.
+2. Strict VBV conformance in zone encoding.
+
+Bug fixes
+---------
+1. Multi-pass encode failures with :option:`--frame-dup`.
+2. Corrupted bitstreams with :option:`--hist-scenecut` when input depth and internal bit-depth differ.
+3. Incorrect analysis propagation in multi-level save-load architecture.
+4. Failure in detecting NUMA packages installed in non-standard directories.
+
 Version 3.3
 ===========
 
​

x265_3.3.tar.gz/source/CMakeLists.txt -> x265_3.4.tar.gz/source/CMakeLists.txt Changed

@@ -29,7 +29,7 @@
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 188)
+set(X265_BUILD 192)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -40,7 +40,7 @@
 # System architecture detection
 string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
-set(ARM_ALIASES armv6l armv7l)
+set(ARM_ALIASES armv6l armv7l aarch64)
 list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
 list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)
 set(POWER_ALIASES ppc64 ppc64le)
@@ -70,9 +70,15 @@
     else()
         set(CROSS_COMPILE_ARM 0)
     endif()
-    message(STATUS "Detected ARM target processor")
     set(ARM 1)
-    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
+    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
+        message(STATUS "Detected ARM64 target processor")
+        set(ARM64 1)
+        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
+    else()
+        message(STATUS "Detected ARM target processor")
+        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
+    endif()
 else()
     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
@@ -95,6 +101,8 @@
         if(NUMA_FOUND)
             link_directories(${NUMA_LIBRARY_DIR})
             list(APPEND CMAKE_REQUIRED_LIBRARIES numa)
+            list(APPEND CMAKE_REQUIRED_INCLUDES ${NUMA_INCLUDE_DIR})
+            list(APPEND CMAKE_REQUIRED_LINK_OPTIONS "-L${NUMA_LIBRARY_DIR}")
             check_symbol_exists(numa_node_of_cpu numa.h NUMA_V2)
             if(NUMA_V2)
                 add_definitions(-DHAVE_LIBNUMA)
@@ -231,14 +239,24 @@
         endif()
     endif()
     if(ARM AND CROSS_COMPILE_ARM)
-        set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
+        if(ARM64)
+            set(ARM_ARGS -fPIC)
+        else()
+            set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
+        endif()
+        message(STATUS "cross compile arm")
     elseif(ARM)
-        find_package(Neon)
-        if(CPU_HAS_NEON)
-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
+        if(ARM64)
+            set(ARM_ARGS -fPIC)
             add_definitions(-DHAVE_NEON)
         else()
-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
+            find_package(Neon)
+            if(CPU_HAS_NEON)
+                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
+                add_definitions(-DHAVE_NEON)
+            else()
+                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
+            endif()
         endif()
     endif()
     add_definitions(${ARM_ARGS})
@@ -518,7 +536,11 @@
     # compile ARM arch asm files here
         enable_language(ASM)
         foreach(ASM ${ARM_ASMS})
-            set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
+            if(ARM64)
+                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
+            else()
+                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
+            endif()
             list(APPEND ASM_SRCS ${ASM_SRC})
             list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
             add_custom_command(
@@ -725,16 +747,16 @@
         # Xcode seems unable to link the CLI with libs, so link as one target
         if(ENABLE_HDR10_PLUS)
         add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT}
-                        x265.cpp x265.h x265cli.h
+                        x265.cpp x265.h x265cli.cpp x265cli.h abrEncApp.cpp abrEncApp.h
                         $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> $<TARGET_OBJECTS:dynamicHDR10> ${ASM_OBJS})
         else()
             add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT}
-                        x265.cpp x265.h x265cli.h
+                        x265.cpp x265.h x265cli.cpp x265cli.h abrEncApp.cpp abrEncApp.h
                         $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${ASM_OBJS})
         endif()
     else()
         add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT} ${X265_RC_FILE}
-                       ${ExportDefs} x265.cpp x265.h x265cli.h)
+                       ${ExportDefs} x265.cpp x265.h x265cli.cpp x265cli.h abrEncApp.cpp abrEncApp.h)
         if(WIN32 OR NOT ENABLE_SHARED OR INTEL_CXX)
             # The CLI cannot link to the shared library on Windows, it
             # requires internal APIs not exported from the DLL

 
@@ -29,7 +29,7 @@
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 188)
+set(X265_BUILD 192)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -40,7 +40,7 @@
 # System architecture detection
 string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
-set(ARM_ALIASES armv6l armv7l)
+set(ARM_ALIASES armv6l armv7l aarch64)
 list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
 list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)
 set(POWER_ALIASES ppc64 ppc64le)
@@ -70,9 +70,15 @@
     else()
         set(CROSS_COMPILE_ARM 0)
     endif()
-    message(STATUS "Detected ARM target processor")
     set(ARM 1)
-    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
+    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
+        message(STATUS "Detected ARM64 target processor")
+        set(ARM64 1)
+        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
+    else()
+        message(STATUS "Detected ARM target processor")
+        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
+    endif()
 else()
     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
@@ -95,6 +101,8 @@
         if(NUMA_FOUND)
             link_directories(${NUMA_LIBRARY_DIR})
             list(APPEND CMAKE_REQUIRED_LIBRARIES numa)
+            list(APPEND CMAKE_REQUIRED_INCLUDES ${NUMA_INCLUDE_DIR})
+            list(APPEND CMAKE_REQUIRED_LINK_OPTIONS "-L${NUMA_LIBRARY_DIR}")
             check_symbol_exists(numa_node_of_cpu numa.h NUMA_V2)
             if(NUMA_V2)
                 add_definitions(-DHAVE_LIBNUMA)
@@ -231,14 +239,24 @@
         endif()
     endif()
     if(ARM AND CROSS_COMPILE_ARM)
-        set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
+        if(ARM64)
+            set(ARM_ARGS -fPIC)
+        else()
+            set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
+        endif()
+        message(STATUS "cross compile arm")
     elseif(ARM)
-        find_package(Neon)
-        if(CPU_HAS_NEON)
-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
+        if(ARM64)
+            set(ARM_ARGS -fPIC)
             add_definitions(-DHAVE_NEON)
         else()
-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
+            find_package(Neon)
+            if(CPU_HAS_NEON)
+                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
+                add_definitions(-DHAVE_NEON)
+            else()
+                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
+            endif()
         endif()
     endif()
     add_definitions(${ARM_ARGS})
@@ -518,7 +536,11 @@
     # compile ARM arch asm files here
         enable_language(ASM)
         foreach(ASM ${ARM_ASMS})
-            set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
+            if(ARM64)
+                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
+            else()
+                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
+            endif()
             list(APPEND ASM_SRCS ${ASM_SRC})
             list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
             add_custom_command(
@@ -725,16 +747,16 @@
         # Xcode seems unable to link the CLI with libs, so link as one target
         if(ENABLE_HDR10_PLUS)
         add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT}
-                        x265.cpp x265.h x265cli.h
+                        x265.cpp x265.h x265cli.cpp x265cli.h abrEncApp.cpp abrEncApp.h
                         $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> $<TARGET_OBJECTS:dynamicHDR10> ${ASM_OBJS})
         else()
             add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT}
-                        x265.cpp x265.h x265cli.h
+                        x265.cpp x265.h x265cli.cpp x265cli.h abrEncApp.cpp abrEncApp.h
                         $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${ASM_OBJS})
         endif()
     else()
         add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT} ${X265_RC_FILE}
-                       ${ExportDefs} x265.cpp x265.h x265cli.h)
+                       ${ExportDefs} x265.cpp x265.h x265cli.cpp x265cli.h abrEncApp.cpp abrEncApp.h)
         if(WIN32 OR NOT ENABLE_SHARED OR INTEL_CXX)
             # The CLI cannot link to the shared library on Windows, it
             # requires internal APIs not exported from the DLL
​

x265_3.4.tar.gz/source/abrEncApp.cpp Added

@@ -0,0 +1,1108 @@
+/*****************************************************************************
+* Copyright (C) 2013-2020 MulticoreWare, Inc
+*
+* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
+*          Aruna Matheswaran <aruna@multicorewareinc.com>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#include "abrEncApp.h"
+#include "mv.h"
+#include "slice.h"
+#include "param.h"
+
+#include <signal.h>
+#include <errno.h>
+
+#include <queue>
+
+using namespace X265_NS;
+
+/* Ctrl-C handler */
+static volatile sig_atomic_t b_ctrl_c /* = 0 */;
+static void sigint_handler(int)
+{
+    b_ctrl_c = 1;
+}
+
+namespace X265_NS {
+    // private namespace
+#define X265_INPUT_QUEUE_SIZE 250
+
+    /* Create one PassEncoder per entry of cliopt[], call init() on each of
+     * them, allocate the shared queues and finally call startThreads() on
+     * every pass.
+     * ret is set to 4 on the allocation-failure paths below and is also
+     * handed to PassEncoder::init(). */
+    AbrEncoder::AbrEncoder(CLIOptions cliopt[], uint8_t numEncodes, int &ret)
+    {
+        m_numEncodes = numEncodes;
+        m_numActiveEncodes.set(numEncodes);
+        /* X265_INPUT_QUEUE_SIZE slots when more than one encode is requested,
+         * a single slot otherwise */
+        m_queueSize = (numEncodes > 1) ? X265_INPUT_QUEUE_SIZE : 1;
+        /* NOTE(review): the result of this allocation is used below without a check */
+        m_passEnc = X265_MALLOC(PassEncoder*, m_numEncodes);
+
+        for (uint8_t i = 0; i < m_numEncodes; i++)
+        {
+            m_passEnc[i] = new PassEncoder(i, cliopt[i], this);
+            /* NOTE(review): a plain new-expression normally throws instead of
+             * returning NULL, and init() below is still called on the pointer
+             * when this branch is taken - confirm the intended handling */
+            if (!m_passEnc[i])
+            {
+                x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for passEncoder\n");
+                ret = 4;
+            }
+            m_passEnc[i]->init(ret);
+        }
+
+        /* allocBuffers() reads m_passEnc[pass]->m_param, so it has to run
+         * after the loop above */
+        if (!allocBuffers())
+        {
+            x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for buffers\n");
+            ret = 4;
+        }
+
+        /* start passEncoder worker threads (done even when ret was set above) */
+        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
+            m_passEnc[pass]->startThreads();
+    }
+
+    /* Allocate the per-encode counters and the per-encode, per-queue-slot
+     * picture / analysis / flag arrays.
+     * Returns true unconditionally: none of the allocations made here is
+     * checked for failure.
+     * Arrays obtained with X265_MALLOC are released with X265_FREE and arrays
+     * obtained with new[] are released with delete[] in destroy(). */
+    bool AbrEncoder::allocBuffers()
+    {
+        /* outer arrays, one entry per encode */
+        m_inputPicBuffer = X265_MALLOC(x265_picture**, m_numEncodes);
+        m_analysisBuffer = X265_MALLOC(x265_analysis_data*, m_numEncodes);
+
+        /* one counter per encode */
+        m_picWriteCnt = new ThreadSafeInteger[m_numEncodes];
+        m_picReadCnt = new ThreadSafeInteger[m_numEncodes];
+        m_analysisWriteCnt = new ThreadSafeInteger[m_numEncodes];
+        m_analysisReadCnt = new ThreadSafeInteger[m_numEncodes];
+
+        /* outer arrays for the per-slot counters/flags filled in the loop below */
+        m_picIdxReadCnt = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
+        m_analysisWrite = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
+        m_analysisRead = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
+        m_readFlag = X265_MALLOC(int*, m_numEncodes);
+
+        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
+        {
+            m_inputPicBuffer[pass] = X265_MALLOC(x265_picture*, m_queueSize);
+            for (uint32_t idx = 0; idx < m_queueSize; idx++)
+            {
+                /* every queue slot gets its own picture, initialised from
+                 * this pass's encoder parameters */
+                m_inputPicBuffer[pass][idx] = x265_picture_alloc();
+                x265_picture_init(m_passEnc[pass]->m_param, m_inputPicBuffer[pass][idx]);
+            }
+
+            /* m_queueSize entries each; this function does not write to the
+             * m_analysisBuffer / m_readFlag elements after allocating them */
+            m_analysisBuffer[pass] = X265_MALLOC(x265_analysis_data, m_queueSize);
+            m_picIdxReadCnt[pass] = new ThreadSafeInteger[m_queueSize];
+            m_analysisWrite[pass] = new ThreadSafeInteger[m_queueSize];
+            m_analysisRead[pass] = new ThreadSafeInteger[m_queueSize];
+            m_readFlag[pass] = X265_MALLOC(int, m_queueSize);
+        }
+        return true;
+    }
+
+    /* Release everything allocated by the constructor and allocBuffers():
+     * the queued pictures, the per-slot and per-encode arrays, and the
+     * PassEncoder objects themselves (each is destroy()ed, then deleted). */
+    void AbrEncoder::destroy()
+    {
+        x265_cleanup(); /* Free library singletons */
+        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
+        {
+            for (uint32_t index = 0; index < m_queueSize; index++)
+            {
+                /* planes[0] is freed separately from the picture itself;
+                 * presumably the pixel storage is attached to it elsewhere
+                 * (not visible in this chunk) - TODO confirm */
+                X265_FREE(m_inputPicBuffer[pass][index]->planes[0]);
+                x265_picture_free(m_inputPicBuffer[pass][index]);
+            }
+
+            /* X265_FREE pairs with X265_MALLOC, delete[] with new[] in allocBuffers() */
+            X265_FREE(m_inputPicBuffer[pass]);
+            X265_FREE(m_analysisBuffer[pass]);
+            X265_FREE(m_readFlag[pass]);
+            delete[] m_picIdxReadCnt[pass];
+            delete[] m_analysisWrite[pass];
+            delete[] m_analysisRead[pass];
+            m_passEnc[pass]->destroy();
+            delete m_passEnc[pass];
+        }
+        /* outer, per-encode arrays */
+        X265_FREE(m_inputPicBuffer);
+        X265_FREE(m_analysisBuffer);
+        X265_FREE(m_readFlag);
+
+        delete[] m_picWriteCnt;
+        delete[] m_picReadCnt;
+        delete[] m_analysisWriteCnt;
+        delete[] m_analysisReadCnt;
+
+        X265_FREE(m_picIdxReadCnt);
+        X265_FREE(m_analysisWrite);
+        X265_FREE(m_analysisRead);
+
+        X265_FREE(m_passEnc);
+    }
+
+    /* Set up the bookkeeping for one encode of the ABR ladder.
+     *   id     - index of this encode; stored in m_id
+     *   cliopt - options for this encode; copied into m_cliopt
+     *   parent - owning AbrEncoder; stored in m_parent
+     * The encoder, scaler and reader pointers start out NULL here; init()
+     * assigns m_encoder and creates either the reader or the scaler. */
+    PassEncoder::PassEncoder(uint32_t id, CLIOptions cliopt, AbrEncoder *parent)
+    {
+        m_id = id;
+        m_cliopt = cliopt;
+        m_parent = parent;
+        /* Only encodes that are not scaler-fed (scaler disabled, or id 0) take
+         * the input file from the CLI options. For the others m_input used to
+         * be left uninitialised; give it a defined NULL value instead. */
+        if (!(m_cliopt.enableScaler && m_id))
+            m_input = m_cliopt.input;
+        else
+            m_input = NULL;
+        m_param = cliopt.param;
+        m_inputOver = false;
+        m_lastIdx = -1;
+        m_encoder = NULL;
+        m_scaler = NULL;
+        m_reader = NULL;
+        m_ret = 0;
+    }
+
+    int PassEncoder::init(int &result)
+    {
+        if (m_parent->m_numEncodes > 1)
+            setReuseLevel();
+                
+        if (!(m_cliopt.enableScaler && m_id))
+            m_reader = new Reader(m_id, this);
+        else
+        {
+            VideoDesc *src = NULL, *dst = NULL;
+            dst = new VideoDesc(m_param->sourceWidth, m_param->sourceHeight, m_param->internalCsp, m_param->internalBitDepth);
+            int dstW = m_parent->m_passEnc[m_id - 1]->m_param->sourceWidth;
+            int dstH = m_parent->m_passEnc[m_id - 1]->m_param->sourceHeight;
+            src = new VideoDesc(dstW, dstH, m_param->internalCsp, m_param->internalBitDepth);
+            if (src != NULL && dst != NULL)
+            {
+                m_scaler = new Scaler(0, 1, m_id, src, dst, this);
+                if (!m_scaler)
+                {
+                    x265_log(m_param, X265_LOG_ERROR, "\n MALLOC failure in Scaler");
+                    result = 4;
+                }
+            }
+        }
+
+        /* note: we could try to acquire a different libx265 API here based on
+        * the profile found during option parsing, but it must be done before
+        * opening an encoder */
+
+        if (m_param)
+            m_encoder = m_cliopt.api->encoder_open(m_param);
+        if (!m_encoder)
+        {
+            x265_log(NULL, X265_LOG_ERROR, "x265_encoder_open() failed for Enc, \n");
+            m_ret = 2;
+            return -1;
+        }
+

 
@@ -0,0 +1,1108 @@
+/*****************************************************************************
+* Copyright (C) 2013-2020 MulticoreWare, Inc
+*
+* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
+*          Aruna Matheswaran <aruna@multicorewareinc.com>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#include "abrEncApp.h"
+#include "mv.h"
+#include "slice.h"
+#include "param.h"
+
+#include <signal.h>
+#include <errno.h>
+
+#include <queue>
+
+using namespace X265_NS;
+
+/* Ctrl-C handler */
+static volatile sig_atomic_t b_ctrl_c /* = 0 */;
+static void sigint_handler(int)
+{
+    b_ctrl_c = 1;
+}
+
+namespace X265_NS {
+    // private namespace
+#define X265_INPUT_QUEUE_SIZE 250
+
+    /* Create one PassEncoder per entry of cliopt[], call init() on each of
+     * them, allocate the shared queues and finally call startThreads() on
+     * every pass.
+     * ret is set to 4 on the allocation-failure paths below and is also
+     * handed to PassEncoder::init(). */
+    AbrEncoder::AbrEncoder(CLIOptions cliopt[], uint8_t numEncodes, int &ret)
+    {
+        m_numEncodes = numEncodes;
+        m_numActiveEncodes.set(numEncodes);
+        /* X265_INPUT_QUEUE_SIZE slots when more than one encode is requested,
+         * a single slot otherwise */
+        m_queueSize = (numEncodes > 1) ? X265_INPUT_QUEUE_SIZE : 1;
+        /* NOTE(review): the result of this allocation is used below without a check */
+        m_passEnc = X265_MALLOC(PassEncoder*, m_numEncodes);
+
+        for (uint8_t i = 0; i < m_numEncodes; i++)
+        {
+            m_passEnc[i] = new PassEncoder(i, cliopt[i], this);
+            /* NOTE(review): a plain new-expression normally throws instead of
+             * returning NULL, and init() below is still called on the pointer
+             * when this branch is taken - confirm the intended handling */
+            if (!m_passEnc[i])
+            {
+                x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for passEncoder\n");
+                ret = 4;
+            }
+            m_passEnc[i]->init(ret);
+        }
+
+        /* allocBuffers() reads m_passEnc[pass]->m_param, so it has to run
+         * after the loop above */
+        if (!allocBuffers())
+        {
+            x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for buffers\n");
+            ret = 4;
+        }
+
+        /* start passEncoder worker threads (done even when ret was set above) */
+        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
+            m_passEnc[pass]->startThreads();
+    }
+
+    /* Allocate the per-encode counters and the per-encode, per-queue-slot
+     * picture / analysis / flag arrays.
+     * Returns true unconditionally: none of the allocations made here is
+     * checked for failure.
+     * Arrays obtained with X265_MALLOC are released with X265_FREE and arrays
+     * obtained with new[] are released with delete[] in destroy(). */
+    bool AbrEncoder::allocBuffers()
+    {
+        /* outer arrays, one entry per encode */
+        m_inputPicBuffer = X265_MALLOC(x265_picture**, m_numEncodes);
+        m_analysisBuffer = X265_MALLOC(x265_analysis_data*, m_numEncodes);
+
+        /* one counter per encode */
+        m_picWriteCnt = new ThreadSafeInteger[m_numEncodes];
+        m_picReadCnt = new ThreadSafeInteger[m_numEncodes];
+        m_analysisWriteCnt = new ThreadSafeInteger[m_numEncodes];
+        m_analysisReadCnt = new ThreadSafeInteger[m_numEncodes];
+
+        /* outer arrays for the per-slot counters/flags filled in the loop below */
+        m_picIdxReadCnt = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
+        m_analysisWrite = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
+        m_analysisRead = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
+        m_readFlag = X265_MALLOC(int*, m_numEncodes);
+
+        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
+        {
+            m_inputPicBuffer[pass] = X265_MALLOC(x265_picture*, m_queueSize);
+            for (uint32_t idx = 0; idx < m_queueSize; idx++)
+            {
+                /* every queue slot gets its own picture, initialised from
+                 * this pass's encoder parameters */
+                m_inputPicBuffer[pass][idx] = x265_picture_alloc();
+                x265_picture_init(m_passEnc[pass]->m_param, m_inputPicBuffer[pass][idx]);
+            }
+
+            /* m_queueSize entries each; this function does not write to the
+             * m_analysisBuffer / m_readFlag elements after allocating them */
+            m_analysisBuffer[pass] = X265_MALLOC(x265_analysis_data, m_queueSize);
+            m_picIdxReadCnt[pass] = new ThreadSafeInteger[m_queueSize];
+            m_analysisWrite[pass] = new ThreadSafeInteger[m_queueSize];
+            m_analysisRead[pass] = new ThreadSafeInteger[m_queueSize];
+            m_readFlag[pass] = X265_MALLOC(int, m_queueSize);
+        }
+        return true;
+    }
+
+    /* Release everything allocated by the constructor and allocBuffers():
+     * the queued pictures, the per-slot and per-encode arrays, and the
+     * PassEncoder objects themselves (each is destroy()ed, then deleted). */
+    void AbrEncoder::destroy()
+    {
+        x265_cleanup(); /* Free library singletons */
+        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
+        {
+            for (uint32_t index = 0; index < m_queueSize; index++)
+            {
+                /* planes[0] is freed separately from the picture itself;
+                 * presumably the pixel storage is attached to it elsewhere
+                 * (not visible in this chunk) - TODO confirm */
+                X265_FREE(m_inputPicBuffer[pass][index]->planes[0]);
+                x265_picture_free(m_inputPicBuffer[pass][index]);
+            }
+
+            /* X265_FREE pairs with X265_MALLOC, delete[] with new[] in allocBuffers() */
+            X265_FREE(m_inputPicBuffer[pass]);
+            X265_FREE(m_analysisBuffer[pass]);
+            X265_FREE(m_readFlag[pass]);
+            delete[] m_picIdxReadCnt[pass];
+            delete[] m_analysisWrite[pass];
+            delete[] m_analysisRead[pass];
+            m_passEnc[pass]->destroy();
+            delete m_passEnc[pass];
+        }
+        /* outer, per-encode arrays */
+        X265_FREE(m_inputPicBuffer);
+        X265_FREE(m_analysisBuffer);
+        X265_FREE(m_readFlag);
+
+        delete[] m_picWriteCnt;
+        delete[] m_picReadCnt;
+        delete[] m_analysisWriteCnt;
+        delete[] m_analysisReadCnt;
+
+        X265_FREE(m_picIdxReadCnt);
+        X265_FREE(m_analysisWrite);
+        X265_FREE(m_analysisRead);
+
+        X265_FREE(m_passEnc);
+    }
+
+    /* Set up the bookkeeping for one encode of the ABR ladder.
+     *   id     - index of this encode; stored in m_id
+     *   cliopt - options for this encode; copied into m_cliopt
+     *   parent - owning AbrEncoder; stored in m_parent
+     * The encoder, scaler and reader pointers start out NULL here; init()
+     * assigns m_encoder and creates either the reader or the scaler. */
+    PassEncoder::PassEncoder(uint32_t id, CLIOptions cliopt, AbrEncoder *parent)
+    {
+        m_id = id;
+        m_cliopt = cliopt;
+        m_parent = parent;
+        /* Only encodes that are not scaler-fed (scaler disabled, or id 0) take
+         * the input file from the CLI options. For the others m_input used to
+         * be left uninitialised; give it a defined NULL value instead. */
+        if (!(m_cliopt.enableScaler && m_id))
+            m_input = m_cliopt.input;
+        else
+            m_input = NULL;
+        m_param = cliopt.param;
+        m_inputOver = false;
+        m_lastIdx = -1;
+        m_encoder = NULL;
+        m_scaler = NULL;
+        m_reader = NULL;
+        m_ret = 0;
+    }
+
+    int PassEncoder::init(int &result)
+    {
+        if (m_parent->m_numEncodes > 1)
+            setReuseLevel();
+                
+        if (!(m_cliopt.enableScaler && m_id))
+            m_reader = new Reader(m_id, this);
+        else
+        {
+            VideoDesc *src = NULL, *dst = NULL;
+            dst = new VideoDesc(m_param->sourceWidth, m_param->sourceHeight, m_param->internalCsp, m_param->internalBitDepth);
+            int dstW = m_parent->m_passEnc[m_id - 1]->m_param->sourceWidth;
+            int dstH = m_parent->m_passEnc[m_id - 1]->m_param->sourceHeight;
+            src = new VideoDesc(dstW, dstH, m_param->internalCsp, m_param->internalBitDepth);
+            if (src != NULL && dst != NULL)
+            {
+                m_scaler = new Scaler(0, 1, m_id, src, dst, this);
+                if (!m_scaler)
+                {
+                    x265_log(m_param, X265_LOG_ERROR, "\n MALLOC failure in Scaler");
+                    result = 4;
+                }
+            }
+        }
+
+        /* note: we could try to acquire a different libx265 API here based on
+        * the profile found during option parsing, but it must be done before
+        * opening an encoder */
+
+        if (m_param)
+            m_encoder = m_cliopt.api->encoder_open(m_param);
+        if (!m_encoder)
+        {
+            x265_log(NULL, X265_LOG_ERROR, "x265_encoder_open() failed for Enc, \n");
+            m_ret = 2;
+            return -1;
+        }
+
​

x265_3.4.tar.gz/source/abrEncApp.h Added

@@ -0,0 +1,153 @@
+/*****************************************************************************
+* Copyright (C) 2013-2020 MulticoreWare, Inc
+*
+* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
+*          Aruna Matheswaran <aruna@multicorewareinc.com>
+*           
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#ifndef ABR_ENCODE_H
+#define ABR_ENCODE_H
+
+#include "x265.h"
+#include "scaler.h"
+#include "threading.h"
+#include "x265cli.h"
+
+namespace X265_NS {
+    // private namespace
+
+    class PassEncoder;
+    class Scaler;
+    class Reader;
+
+    /* Owner of all encodes of one ABR-ladder run: holds the PassEncoder
+     * objects plus the picture/analysis queues and counters that are
+     * allocated in allocBuffers() and released in destroy(). */
+    class AbrEncoder
+    {
+    public:
+        uint8_t           m_numEncodes;       // number of PassEncoder objects in m_passEnc
+        PassEncoder        **m_passEnc;       //[numEncodes]
+        uint32_t           m_queueSize;       // slots per encode: X265_INPUT_QUEUE_SIZE for several encodes, else 1
+        ThreadSafeInteger  m_numActiveEncodes; // set to numEncodes by the constructor
+
+        x265_picture       ***m_inputPicBuffer; //[numEncodes][queueSize]
+        x265_analysis_data **m_analysisBuffer; //[numEncodes][queueSize]
+        int                **m_readFlag; //[numEncodes][queueSize]
+
+        ThreadSafeInteger  *m_picWriteCnt; //[numEncodes]
+        ThreadSafeInteger  *m_picReadCnt; //[numEncodes]
+        ThreadSafeInteger  **m_picIdxReadCnt; //[numEncodes][queueSize]
+        ThreadSafeInteger  *m_analysisWriteCnt; //[numEncodes]
+        ThreadSafeInteger  *m_analysisReadCnt; //[numEncodes]
+        ThreadSafeInteger  **m_analysisWrite; //[numEncodes][queueSize]
+        ThreadSafeInteger  **m_analysisRead; //[numEncodes][queueSize]
+
+        // Builds and initialises the pass encoders, allocates the buffers and
+        // starts the pass threads; ret receives 4 on allocation failure.
+        AbrEncoder(CLIOptions cliopt[], uint8_t numEncodes, int& ret);
+        // Allocates the arrays above; the current implementation always returns true.
+        bool allocBuffers();
+        // Frees the arrays above and destroys/deletes every pass encoder.
+        void destroy();
+
+    };
+
+    /* One encode of the ABR ladder, derived from Thread. It owns an
+     * x265_encoder handle and either a Reader or a Scaler, which init()
+     * creates depending on m_cliopt.enableScaler and m_id. */
+    class PassEncoder : public Thread
+    {
+    public:
+
+        uint32_t m_id;              // index of this encode within AbrEncoder::m_passEnc
+        x265_param *m_param;        // taken from the CLIOptions passed to the constructor
+        AbrEncoder *m_parent;       // owning AbrEncoder
+        x265_encoder *m_encoder;    // NULL until init() opens the encoder
+        Reader *m_reader;           // created by init() unless this encode is scaler-fed
+        Scaler *m_scaler;           // created by init() when enableScaler is set and m_id != 0
+        bool m_inputOver;           // false after construction
+
+        int m_threadActive;
+        int m_lastIdx;              // -1 after construction
+        uint32_t m_outputNalsCount;
+
+        x265_picture **m_inputPicBuffer;
+        x265_analysis_data **m_analysisBuffer;
+        x265_nal **m_outputNals;
+        x265_picture **m_outputRecon;
+
+        CLIOptions m_cliopt;        // by-value copy of the options given to the constructor
+        InputFile* m_input;         // m_cliopt.input, assigned only when this encode is not scaler-fed
+        const char* m_reconPlayCmd;
+        FILE*    m_qpfile;
+        FILE*    m_zoneFile;
+        FILE*    m_dolbyVisionRpu;/* File containing Dolby Vision BL RPU metadata */
+
+        int m_ret;                  // 0 after construction; init() sets 2 when the encoder cannot be opened
+
+        PassEncoder(uint32_t id, CLIOptions cliopt, AbrEncoder *parent);
+        // Creates the reader or scaler and opens the encoder; returns -1 when
+        // the encoder cannot be opened, result receives 4 on scaler allocation failure.
+        int init(int &result);
+        void setReuseLevel();
+
+        void startThreads();
+        void copyInfo(x265_analysis_data *src);
+
+        bool readPicture(x265_picture*);
+        void destroy();
+
+    private:
+        void threadMain();
+    };
+
+    /* Thread-derived helper that a PassEncoder creates instead of a Reader
+     * when its input comes from scaling (see PassEncoder::init()). Converts
+     * pictures between the source and destination VideoDesc formats via
+     * scalePic(). */
+    class Scaler : public Thread
+    {
+    public:
+        PassEncoder *m_parentEnc;
+        int m_id;
+        int m_scalePlanes[3];
+        int m_scaleFrameSize;
+        uint32_t m_threadId;
+        uint32_t m_threadTotal;
+        ThreadSafeInteger m_scaledWriteCnt;
+        VideoDesc* m_srcFormat;
+        VideoDesc* m_dstFormat;
+        int m_threadActive;
+        ScalerFilterManager* m_filterManager; // deleted and reset to NULL by destroy()
+
+        // NOTE(review): the constructor body is not part of this header;
+        // presumably the arguments map onto the same-named members - confirm.
+        Scaler(int threadId, int threadNum, int id, VideoDesc *src, VideoDesc * dst, PassEncoder *parentEnc);
+        bool scalePic(x265_picture *destination, x265_picture *source);
+        void threadMain();
+        // Releases the filter manager; safe to call more than once because
+        // the pointer is cleared after deletion.
+        void destroy()
+        {
+            if (m_filterManager)
+            {
+                delete m_filterManager;
+                m_filterManager = NULL;
+            }
+        }
+    };
+
+    /* Thread-derived helper that a PassEncoder creates in init() when it is
+     * not fed by a Scaler. */
+    class Reader : public Thread
+    {
+    public:
+        PassEncoder *m_parentEnc;
+        int m_id;
+        InputFile* m_input;   // presumably the input this reader pulls frames from - constructor body not visible here
+        int m_threadActive;
+
+        Reader(int id, PassEncoder *parentEnc);
+        void threadMain();
+    };
+}
+
+#endif // ifndef ABR_ENCODE_H
+#pragma once

 
@@ -0,0 +1,153 @@
+/*****************************************************************************
+* Copyright (C) 2013-2020 MulticoreWare, Inc
+*
+* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
+*          Aruna Matheswaran <aruna@multicorewareinc.com>
+*           
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#ifndef ABR_ENCODE_H
+#define ABR_ENCODE_H
+
+#include "x265.h"
+#include "scaler.h"
+#include "threading.h"
+#include "x265cli.h"
+
+namespace X265_NS {
+    // private namespace
+
+    class PassEncoder;
+    class Scaler;
+    class Reader;
+
+    /* Owner of all encodes of one ABR-ladder run: holds the PassEncoder
+     * objects plus the picture/analysis queues and counters that are
+     * allocated in allocBuffers() and released in destroy(). */
+    class AbrEncoder
+    {
+    public:
+        uint8_t           m_numEncodes;       // number of PassEncoder objects in m_passEnc
+        PassEncoder        **m_passEnc;       //[numEncodes]
+        uint32_t           m_queueSize;       // slots per encode: X265_INPUT_QUEUE_SIZE for several encodes, else 1
+        ThreadSafeInteger  m_numActiveEncodes; // set to numEncodes by the constructor
+
+        x265_picture       ***m_inputPicBuffer; //[numEncodes][queueSize]
+        x265_analysis_data **m_analysisBuffer; //[numEncodes][queueSize]
+        int                **m_readFlag; //[numEncodes][queueSize]
+
+        ThreadSafeInteger  *m_picWriteCnt; //[numEncodes]
+        ThreadSafeInteger  *m_picReadCnt; //[numEncodes]
+        ThreadSafeInteger  **m_picIdxReadCnt; //[numEncodes][queueSize]
+        ThreadSafeInteger  *m_analysisWriteCnt; //[numEncodes]
+        ThreadSafeInteger  *m_analysisReadCnt; //[numEncodes]
+        ThreadSafeInteger  **m_analysisWrite; //[numEncodes][queueSize]
+        ThreadSafeInteger  **m_analysisRead; //[numEncodes][queueSize]
+
+        // Builds and initialises the pass encoders, allocates the buffers and
+        // starts the pass threads; ret receives 4 on allocation failure.
+        AbrEncoder(CLIOptions cliopt[], uint8_t numEncodes, int& ret);
+        // Allocates the arrays above; the current implementation always returns true.
+        bool allocBuffers();
+        // Frees the arrays above and destroys/deletes every pass encoder.
+        void destroy();
+
+    };
+
+    /* One encode of the ABR ladder, derived from Thread. It owns an
+     * x265_encoder handle and either a Reader or a Scaler, which init()
+     * creates depending on m_cliopt.enableScaler and m_id. */
+    class PassEncoder : public Thread
+    {
+    public:
+
+        uint32_t m_id;              // index of this encode within AbrEncoder::m_passEnc
+        x265_param *m_param;        // taken from the CLIOptions passed to the constructor
+        AbrEncoder *m_parent;       // owning AbrEncoder
+        x265_encoder *m_encoder;    // NULL until init() opens the encoder
+        Reader *m_reader;           // created by init() unless this encode is scaler-fed
+        Scaler *m_scaler;           // created by init() when enableScaler is set and m_id != 0
+        bool m_inputOver;           // false after construction
+
+        int m_threadActive;
+        int m_lastIdx;              // -1 after construction
+        uint32_t m_outputNalsCount;
+
+        x265_picture **m_inputPicBuffer;
+        x265_analysis_data **m_analysisBuffer;
+        x265_nal **m_outputNals;
+        x265_picture **m_outputRecon;
+
+        CLIOptions m_cliopt;        // by-value copy of the options given to the constructor
+        InputFile* m_input;         // m_cliopt.input, assigned only when this encode is not scaler-fed
+        const char* m_reconPlayCmd;
+        FILE*    m_qpfile;
+        FILE*    m_zoneFile;
+        FILE*    m_dolbyVisionRpu;/* File containing Dolby Vision BL RPU metadata */
+
+        int m_ret;                  // 0 after construction; init() sets 2 when the encoder cannot be opened
+
+        PassEncoder(uint32_t id, CLIOptions cliopt, AbrEncoder *parent);
+        // Creates the reader or scaler and opens the encoder; returns -1 when
+        // the encoder cannot be opened, result receives 4 on scaler allocation failure.
+        int init(int &result);
+        void setReuseLevel();
+
+        void startThreads();
+        void copyInfo(x265_analysis_data *src);
+
+        bool readPicture(x265_picture*);
+        void destroy();
+
+    private:
+        void threadMain();
+    };
+
+    /* Thread-derived helper that a PassEncoder creates instead of a Reader
+     * when its input comes from scaling (see PassEncoder::init()). Converts
+     * pictures between the source and destination VideoDesc formats via
+     * scalePic(). */
+    class Scaler : public Thread
+    {
+    public:
+        PassEncoder *m_parentEnc;
+        int m_id;
+        int m_scalePlanes[3];
+        int m_scaleFrameSize;
+        uint32_t m_threadId;
+        uint32_t m_threadTotal;
+        ThreadSafeInteger m_scaledWriteCnt;
+        VideoDesc* m_srcFormat;
+        VideoDesc* m_dstFormat;
+        int m_threadActive;
+        ScalerFilterManager* m_filterManager; // deleted and reset to NULL by destroy()
+
+        // NOTE(review): the constructor body is not part of this header;
+        // presumably the arguments map onto the same-named members - confirm.
+        Scaler(int threadId, int threadNum, int id, VideoDesc *src, VideoDesc * dst, PassEncoder *parentEnc);
+        bool scalePic(x265_picture *destination, x265_picture *source);
+        void threadMain();
+        // Releases the filter manager; safe to call more than once because
+        // the pointer is cleared after deletion.
+        void destroy()
+        {
+            if (m_filterManager)
+            {
+                delete m_filterManager;
+                m_filterManager = NULL;
+            }
+        }
+    };
+
+    /* Thread-derived helper that a PassEncoder creates in init() when it is
+     * not fed by a Scaler. */
+    class Reader : public Thread
+    {
+    public:
+        PassEncoder *m_parentEnc;
+        int m_id;
+        InputFile* m_input;   // presumably the input this reader pulls frames from - constructor body not visible here
+        int m_threadActive;
+
+        Reader(int id, PassEncoder *parentEnc);
+        void threadMain();
+    };
+}
+
+#endif // ifndef ABR_ENCODE_H
+#pragma once
​

x265_3.3.tar.gz/source/common/CMakeLists.txt -> x265_3.4.tar.gz/source/common/CMakeLists.txt Changed

@@ -14,7 +14,7 @@
 endif(EXTRA_LIB)
 
 if(ENABLE_ASSEMBLY)
-    set_source_files_properties(threading.cpp primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
+    set_source_files_properties(threading.cpp primitives.cpp pixel.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
     list(APPEND VFLAGS "-DENABLE_ASSEMBLY=1")
 endif(ENABLE_ASSEMBLY)
 
@@ -84,16 +84,33 @@
 endif(ENABLE_ASSEMBLY AND X86)
 
 if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
-    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
+    if(ARM64)
+        if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
+            message(STATUS "Detected CXX compiler using -O3 optimization level")
+            add_definitions(-DAUTO_VECTORIZE=1)
+        endif()
+        set(C_SRCS asm-primitives.cpp pixel.h ipfilter8.h)
 
-    # add ARM assembly/intrinsic files here
-    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
-    set(VEC_PRIMITIVES)
+        # add ARM assembly/intrinsic files here
+        set(A_SRCS asm.S mc-a.S sad-a.S pixel-util.S ipfilter8.S)
+        set(VEC_PRIMITIVES)
 
-    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
-    foreach(SRC ${C_SRCS})
-        set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
-    endforeach()
+        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
+        foreach(SRC ${C_SRCS})
+            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+        endforeach()
+    else()
+        set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
+
+        # add ARM assembly/intrinsic files here
+        set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
+        set(VEC_PRIMITIVES)
+
+        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
+        foreach(SRC ${C_SRCS})
+            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
+        endforeach()
+    endif()
     source_group(Assembly FILES ${ASM_PRIMITIVES})
 endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
 
@@ -151,4 +168,5 @@
     predict.cpp  predict.h
     scalinglist.cpp scalinglist.h
     quant.cpp quant.h contexts.h
-    deblock.cpp deblock.h)
+    deblock.cpp deblock.h
+    scaler.cpp scaler.h)

 
@@ -14,7 +14,7 @@
 endif(EXTRA_LIB)
 
 if(ENABLE_ASSEMBLY)
-    set_source_files_properties(threading.cpp primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
+    set_source_files_properties(threading.cpp primitives.cpp pixel.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
     list(APPEND VFLAGS "-DENABLE_ASSEMBLY=1")
 endif(ENABLE_ASSEMBLY)
 
@@ -84,16 +84,33 @@
 endif(ENABLE_ASSEMBLY AND X86)
 
 if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
-    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
+    if(ARM64)
+        if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
+            message(STATUS "Detected CXX compiler using -O3 optimization level")
+            add_definitions(-DAUTO_VECTORIZE=1)
+        endif()
+        set(C_SRCS asm-primitives.cpp pixel.h ipfilter8.h)
 
-    # add ARM assembly/intrinsic files here
-    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
-    set(VEC_PRIMITIVES)
+        # add ARM assembly/intrinsic files here
+        set(A_SRCS asm.S mc-a.S sad-a.S pixel-util.S ipfilter8.S)
+        set(VEC_PRIMITIVES)
 
-    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
-    foreach(SRC ${C_SRCS})
-        set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
-    endforeach()
+        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
+        foreach(SRC ${C_SRCS})
+            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+        endforeach()
+    else()
+        set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
+
+        # add ARM assembly/intrinsic files here
+        set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
+        set(VEC_PRIMITIVES)
+
+        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
+        foreach(SRC ${C_SRCS})
+            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
+        endforeach()
+    endif()
     source_group(Assembly FILES ${ASM_PRIMITIVES})
 endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
 
@@ -151,4 +168,5 @@
     predict.cpp  predict.h
     scalinglist.cpp scalinglist.h
     quant.cpp quant.h contexts.h
-    deblock.cpp deblock.h)
+    deblock.cpp deblock.h
+    scaler.cpp scaler.h)
​

x265_3.4.tar.gz/source/common/aarch64/asm-primitives.cpp Added

@@ -0,0 +1,219 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *          Yimeng Su <yimeng.su@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "primitives.h"
+#include "x265.h"
+#include "cpu.h"
+
+
+#if defined(__GNUC__)
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#endif
+
+#define GCC_4_9_0 40900
+#define GCC_5_1_0 50100
+
+extern "C" {
+#include "pixel.h"
+#include "pixel-util.h"
+#include "ipfilter8.h"
+}
+
+namespace X265_NS {
+// private x265 namespace
+
+
+template<int size> // size: index into primitives.pu[] (the LUMA_WxH entry whose primitives are called)
+void interp_8tap_hv_pp_cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
+{
+    ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]); // int16_t scratch buffer shared by the two filter passes
+    const int halfFilterSize = NTAPS_LUMA >> 1; // half of the luma tap count
+    const int immedStride = MAX_CU_SIZE; // row stride of immed, counted in int16_t elements
+
+    primitives.pu[size].luma_hps(src, srcStride, immed, immedStride, idxX, 1); // pass 1: horizontal filter chosen by idxX, pixel -> int16_t; the trailing 1 is presumably the isRowExt flag (confirm against the primitive's signature)
+    primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dst, dstStride, idxY); // pass 2: vertical filter chosen by idxY, int16_t -> pixel, reading from (halfFilterSize - 1) rows into immed
+}
+
+
+/* Temporary workaround: the luma_vsp assembly primitive has not been completed,
+ * but interp_8tap_hv_pp_cpu mixes a C primitive with an assembly primitive.
+ * Without the aliases set up here, a segmentation fault occurs. */
+void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask) // copies luma_vsp pointers from cp into asmp
+{
+    if (cpuMask & X265_CPU_NEON) // nothing is copied unless the NEON bit is set in cpuMask
+    {
+        asmp.pu[LUMA_8x4].luma_vsp   = cp.pu[LUMA_8x4].luma_vsp; // these five sizes are aliased unconditionally
+        asmp.pu[LUMA_8x8].luma_vsp   = cp.pu[LUMA_8x8].luma_vsp;
+        asmp.pu[LUMA_8x16].luma_vsp  = cp.pu[LUMA_8x16].luma_vsp;
+        asmp.pu[LUMA_8x32].luma_vsp  = cp.pu[LUMA_8x32].luma_vsp;
+        asmp.pu[LUMA_12x16].luma_vsp = cp.pu[LUMA_12x16].luma_vsp;
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* compiled out only when AUTO_VECTORIZE is set and GCC >= 5.1.0 */
+        asmp.pu[LUMA_16x4].luma_vsp  = cp.pu[LUMA_16x4].luma_vsp;
+        asmp.pu[LUMA_16x8].luma_vsp  = cp.pu[LUMA_16x8].luma_vsp;
+        asmp.pu[LUMA_16x12].luma_vsp = cp.pu[LUMA_16x12].luma_vsp;
+        asmp.pu[LUMA_16x16].luma_vsp = cp.pu[LUMA_16x16].luma_vsp;
+        asmp.pu[LUMA_16x32].luma_vsp = cp.pu[LUMA_16x32].luma_vsp;
+        asmp.pu[LUMA_16x64].luma_vsp = cp.pu[LUMA_16x64].luma_vsp;
+        asmp.pu[LUMA_32x16].luma_vsp = cp.pu[LUMA_32x16].luma_vsp;
+        asmp.pu[LUMA_32x24].luma_vsp = cp.pu[LUMA_32x24].luma_vsp;
+        asmp.pu[LUMA_32x32].luma_vsp = cp.pu[LUMA_32x32].luma_vsp;
+        asmp.pu[LUMA_32x64].luma_vsp = cp.pu[LUMA_32x64].luma_vsp;
+        asmp.pu[LUMA_48x64].luma_vsp = cp.pu[LUMA_48x64].luma_vsp;
+        asmp.pu[LUMA_64x16].luma_vsp = cp.pu[LUMA_64x16].luma_vsp;
+        asmp.pu[LUMA_64x32].luma_vsp = cp.pu[LUMA_64x32].luma_vsp;
+        asmp.pu[LUMA_64x48].luma_vsp = cp.pu[LUMA_64x48].luma_vsp;
+        asmp.pu[LUMA_64x64].luma_vsp = cp.pu[LUMA_64x64].luma_vsp;    
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* nested: also compiled out when AUTO_VECTORIZE is set and GCC >= 4.9.0 */
+        asmp.pu[LUMA_4x4].luma_vsp   = cp.pu[LUMA_4x4].luma_vsp;
+        asmp.pu[LUMA_4x8].luma_vsp   = cp.pu[LUMA_4x8].luma_vsp;
+        asmp.pu[LUMA_4x16].luma_vsp  = cp.pu[LUMA_4x16].luma_vsp;
+        asmp.pu[LUMA_24x32].luma_vsp = cp.pu[LUMA_24x32].luma_vsp;
+        asmp.pu[LUMA_32x8].luma_vsp  = cp.pu[LUMA_32x8].luma_vsp;
+#endif /* GCC_4_9_0 condition */
+#endif /* GCC_5_1_0 condition */
+    }
+}
+
+
+void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) 
+{
+    if (cpuMask & X265_CPU_NEON)
+    {
+        p.pu[LUMA_4x4].satd   = PFX(pixel_satd_4x4_neon);
+        p.pu[LUMA_4x8].satd   = PFX(pixel_satd_4x8_neon);
+        p.pu[LUMA_4x16].satd  = PFX(pixel_satd_4x16_neon);
+        p.pu[LUMA_8x4].satd   = PFX(pixel_satd_8x4_neon);
+        p.pu[LUMA_8x8].satd   = PFX(pixel_satd_8x8_neon);
+        p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_neon);
+        
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd    = PFX(pixel_satd_4x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd    = PFX(pixel_satd_4x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd   = PFX(pixel_satd_4x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd    = PFX(pixel_satd_8x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd    = PFX(pixel_satd_8x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd  = PFX(pixel_satd_12x16_neon);
+        
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd    = PFX(pixel_satd_4x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd    = PFX(pixel_satd_4x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd   = PFX(pixel_satd_4x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd   = PFX(pixel_satd_4x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd    = PFX(pixel_satd_8x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd    = PFX(pixel_satd_8x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd  = PFX(pixel_satd_12x32_neon);
+
+        p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_4x4_neon);
+        p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_4x8_neon);
+        p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_4x16_neon);
+        p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_8x4_neon);
+        p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_8x8_neon);
+        p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_8x16_neon);
+        p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_8x32_neon);
+
+        p.pu[LUMA_4x4].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_4x4_neon);
+        p.pu[LUMA_4x8].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_4x8_neon);
+        p.pu[LUMA_4x16].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_4x16_neon);
+        p.pu[LUMA_8x4].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_8x4_neon);
+        p.pu[LUMA_8x8].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_8x8_neon);
+        p.pu[LUMA_8x16].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_8x16_neon);
+        p.pu[LUMA_8x32].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_8x32_neon);
+
+        p.pu[LUMA_8x4].sad_x3   = PFX(sad_x3_8x4_neon);
+        p.pu[LUMA_8x8].sad_x3   = PFX(sad_x3_8x8_neon);
+        p.pu[LUMA_8x16].sad_x3  = PFX(sad_x3_8x16_neon);
+        p.pu[LUMA_8x32].sad_x3  = PFX(sad_x3_8x32_neon);
+
+        p.pu[LUMA_8x4].sad_x4   = PFX(sad_x4_8x4_neon);
+        p.pu[LUMA_8x8].sad_x4   = PFX(sad_x4_8x8_neon);
+        p.pu[LUMA_8x16].sad_x4  = PFX(sad_x4_8x16_neon);
+        p.pu[LUMA_8x32].sad_x4  = PFX(sad_x4_8x32_neon);
+
+        // quant
+        p.quant = PFX(quant_neon);
+        // luma_hps
+        p.pu[LUMA_4x4].luma_hps   = PFX(interp_8tap_horiz_ps_4x4_neon);
+        p.pu[LUMA_4x8].luma_hps   = PFX(interp_8tap_horiz_ps_4x8_neon);
+        p.pu[LUMA_4x16].luma_hps  = PFX(interp_8tap_horiz_ps_4x16_neon);
+        p.pu[LUMA_8x4].luma_hps   = PFX(interp_8tap_horiz_ps_8x4_neon);
+        p.pu[LUMA_8x8].luma_hps   = PFX(interp_8tap_horiz_ps_8x8_neon);
+        p.pu[LUMA_8x16].luma_hps  = PFX(interp_8tap_horiz_ps_8x16_neon);
+        p.pu[LUMA_8x32].luma_hps  = PFX(interp_8tap_horiz_ps_8x32_neon);
+        p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_neon);
+        p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_neon);
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
+        p.pu[LUMA_16x4].luma_hps  = PFX(interp_8tap_horiz_ps_16x4_neon);
+        p.pu[LUMA_16x8].luma_hps  = PFX(interp_8tap_horiz_ps_16x8_neon);
+        p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_neon);
+        p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_neon);
+        p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_neon);
+        p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_neon);
+        p.pu[LUMA_32x8].luma_hps  = PFX(interp_8tap_horiz_ps_32x8_neon);
+        p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_neon);
+        p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_neon);
+        p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_neon);
+        p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_neon);
+        p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_neon);
+        p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_neon);
+        p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_neon);
+        p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_neon);
+        p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_neon);
+#endif
+
+        p.pu[LUMA_8x4].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_8x4>;
+        p.pu[LUMA_8x8].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_8x8>;
+        p.pu[LUMA_8x16].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_8x16>;
+        p.pu[LUMA_8x32].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_8x32>;
+        p.pu[LUMA_12x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_12x16>;
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
+        p.pu[LUMA_16x4].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_16x4>;
+        p.pu[LUMA_16x8].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_16x8>;
+        p.pu[LUMA_16x12].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x12>;
+        p.pu[LUMA_16x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x16>;
+        p.pu[LUMA_16x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x32>;
+        p.pu[LUMA_16x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x64>;
+        p.pu[LUMA_32x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x16>;
+        p.pu[LUMA_32x24].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x24>;
+        p.pu[LUMA_32x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x32>;
+        p.pu[LUMA_32x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x64>;
+        p.pu[LUMA_48x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_48x64>;

 
@@ -0,0 +1,219 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *          Yimeng Su <yimeng.su@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "primitives.h"
+#include "x265.h"
+#include "cpu.h"
+
+
+#if defined(__GNUC__)
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#endif
+
+#define GCC_4_9_0 40900
+#define GCC_5_1_0 50100
+
+extern "C" {
+#include "pixel.h"
+#include "pixel-util.h"
+#include "ipfilter8.h"
+}
+
+namespace X265_NS {
+// private x265 namespace
+
+
+template<int size> // size: index into primitives.pu[] (the LUMA_WxH entry whose primitives are called)
+void interp_8tap_hv_pp_cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
+{
+    ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]); // int16_t scratch buffer shared by the two filter passes
+    const int halfFilterSize = NTAPS_LUMA >> 1; // half of the luma tap count
+    const int immedStride = MAX_CU_SIZE; // row stride of immed, counted in int16_t elements
+
+    primitives.pu[size].luma_hps(src, srcStride, immed, immedStride, idxX, 1); // pass 1: horizontal filter chosen by idxX, pixel -> int16_t; the trailing 1 is presumably the isRowExt flag (confirm against the primitive's signature)
+    primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dst, dstStride, idxY); // pass 2: vertical filter chosen by idxY, int16_t -> pixel, reading from (halfFilterSize - 1) rows into immed
+}
+
+
+/* Temporary workaround: the luma_vsp assembly primitive has not been completed,
+ * but interp_8tap_hv_pp_cpu mixes a C primitive with an assembly primitive.
+ * Without the aliases set up here, a segmentation fault occurs. */
+void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask) // copies luma_vsp pointers from cp into asmp
+{
+    if (cpuMask & X265_CPU_NEON) // nothing is copied unless the NEON bit is set in cpuMask
+    {
+        asmp.pu[LUMA_8x4].luma_vsp   = cp.pu[LUMA_8x4].luma_vsp; // these five sizes are aliased unconditionally
+        asmp.pu[LUMA_8x8].luma_vsp   = cp.pu[LUMA_8x8].luma_vsp;
+        asmp.pu[LUMA_8x16].luma_vsp  = cp.pu[LUMA_8x16].luma_vsp;
+        asmp.pu[LUMA_8x32].luma_vsp  = cp.pu[LUMA_8x32].luma_vsp;
+        asmp.pu[LUMA_12x16].luma_vsp = cp.pu[LUMA_12x16].luma_vsp;
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* compiled out only when AUTO_VECTORIZE is set and GCC >= 5.1.0 */
+        asmp.pu[LUMA_16x4].luma_vsp  = cp.pu[LUMA_16x4].luma_vsp;
+        asmp.pu[LUMA_16x8].luma_vsp  = cp.pu[LUMA_16x8].luma_vsp;
+        asmp.pu[LUMA_16x12].luma_vsp = cp.pu[LUMA_16x12].luma_vsp;
+        asmp.pu[LUMA_16x16].luma_vsp = cp.pu[LUMA_16x16].luma_vsp;
+        asmp.pu[LUMA_16x32].luma_vsp = cp.pu[LUMA_16x32].luma_vsp;
+        asmp.pu[LUMA_16x64].luma_vsp = cp.pu[LUMA_16x64].luma_vsp;
+        asmp.pu[LUMA_32x16].luma_vsp = cp.pu[LUMA_32x16].luma_vsp;
+        asmp.pu[LUMA_32x24].luma_vsp = cp.pu[LUMA_32x24].luma_vsp;
+        asmp.pu[LUMA_32x32].luma_vsp = cp.pu[LUMA_32x32].luma_vsp;
+        asmp.pu[LUMA_32x64].luma_vsp = cp.pu[LUMA_32x64].luma_vsp;
+        asmp.pu[LUMA_48x64].luma_vsp = cp.pu[LUMA_48x64].luma_vsp;
+        asmp.pu[LUMA_64x16].luma_vsp = cp.pu[LUMA_64x16].luma_vsp;
+        asmp.pu[LUMA_64x32].luma_vsp = cp.pu[LUMA_64x32].luma_vsp;
+        asmp.pu[LUMA_64x48].luma_vsp = cp.pu[LUMA_64x48].luma_vsp;
+        asmp.pu[LUMA_64x64].luma_vsp = cp.pu[LUMA_64x64].luma_vsp;    
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* nested: also compiled out when AUTO_VECTORIZE is set and GCC >= 4.9.0 */
+        asmp.pu[LUMA_4x4].luma_vsp   = cp.pu[LUMA_4x4].luma_vsp;
+        asmp.pu[LUMA_4x8].luma_vsp   = cp.pu[LUMA_4x8].luma_vsp;
+        asmp.pu[LUMA_4x16].luma_vsp  = cp.pu[LUMA_4x16].luma_vsp;
+        asmp.pu[LUMA_24x32].luma_vsp = cp.pu[LUMA_24x32].luma_vsp;
+        asmp.pu[LUMA_32x8].luma_vsp  = cp.pu[LUMA_32x8].luma_vsp;
+#endif /* GCC_4_9_0 condition */
+#endif /* GCC_5_1_0 condition */
+    }
+}
+
+
+void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) 
+{
+    if (cpuMask & X265_CPU_NEON)
+    {
+        p.pu[LUMA_4x4].satd   = PFX(pixel_satd_4x4_neon);
+        p.pu[LUMA_4x8].satd   = PFX(pixel_satd_4x8_neon);
+        p.pu[LUMA_4x16].satd  = PFX(pixel_satd_4x16_neon);
+        p.pu[LUMA_8x4].satd   = PFX(pixel_satd_8x4_neon);
+        p.pu[LUMA_8x8].satd   = PFX(pixel_satd_8x8_neon);
+        p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_neon);
+        
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd    = PFX(pixel_satd_4x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd    = PFX(pixel_satd_4x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd   = PFX(pixel_satd_4x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd    = PFX(pixel_satd_8x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd    = PFX(pixel_satd_8x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd  = PFX(pixel_satd_12x16_neon);
+        
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd    = PFX(pixel_satd_4x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd    = PFX(pixel_satd_4x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd   = PFX(pixel_satd_4x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd   = PFX(pixel_satd_4x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd    = PFX(pixel_satd_8x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd    = PFX(pixel_satd_8x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd  = PFX(pixel_satd_12x32_neon);
+
+        p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_4x4_neon);
+        p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_4x8_neon);
+        p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_4x16_neon);
+        p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_8x4_neon);
+        p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_8x8_neon);
+        p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_8x16_neon);
+        p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_8x32_neon);
+
+        p.pu[LUMA_4x4].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_4x4_neon);
+        p.pu[LUMA_4x8].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_4x8_neon);
+        p.pu[LUMA_4x16].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_4x16_neon);
+        p.pu[LUMA_8x4].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_8x4_neon);
+        p.pu[LUMA_8x8].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_8x8_neon);
+        p.pu[LUMA_8x16].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_8x16_neon);
+        p.pu[LUMA_8x32].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_8x32_neon);
+
+        p.pu[LUMA_8x4].sad_x3   = PFX(sad_x3_8x4_neon);
+        p.pu[LUMA_8x8].sad_x3   = PFX(sad_x3_8x8_neon);
+        p.pu[LUMA_8x16].sad_x3  = PFX(sad_x3_8x16_neon);
+        p.pu[LUMA_8x32].sad_x3  = PFX(sad_x3_8x32_neon);
+
+        p.pu[LUMA_8x4].sad_x4   = PFX(sad_x4_8x4_neon);
+        p.pu[LUMA_8x8].sad_x4   = PFX(sad_x4_8x8_neon);
+        p.pu[LUMA_8x16].sad_x4  = PFX(sad_x4_8x16_neon);
+        p.pu[LUMA_8x32].sad_x4  = PFX(sad_x4_8x32_neon);
+
+        // quant
+        p.quant = PFX(quant_neon);
+        // luma_hps
+        p.pu[LUMA_4x4].luma_hps   = PFX(interp_8tap_horiz_ps_4x4_neon);
+        p.pu[LUMA_4x8].luma_hps   = PFX(interp_8tap_horiz_ps_4x8_neon);
+        p.pu[LUMA_4x16].luma_hps  = PFX(interp_8tap_horiz_ps_4x16_neon);
+        p.pu[LUMA_8x4].luma_hps   = PFX(interp_8tap_horiz_ps_8x4_neon);
+        p.pu[LUMA_8x8].luma_hps   = PFX(interp_8tap_horiz_ps_8x8_neon);
+        p.pu[LUMA_8x16].luma_hps  = PFX(interp_8tap_horiz_ps_8x16_neon);
+        p.pu[LUMA_8x32].luma_hps  = PFX(interp_8tap_horiz_ps_8x32_neon);
+        p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_neon);
+        p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_neon);
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
+        p.pu[LUMA_16x4].luma_hps  = PFX(interp_8tap_horiz_ps_16x4_neon);
+        p.pu[LUMA_16x8].luma_hps  = PFX(interp_8tap_horiz_ps_16x8_neon);
+        p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_neon);
+        p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_neon);
+        p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_neon);
+        p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_neon);
+        p.pu[LUMA_32x8].luma_hps  = PFX(interp_8tap_horiz_ps_32x8_neon);
+        p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_neon);
+        p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_neon);
+        p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_neon);
+        p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_neon);
+        p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_neon);
+        p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_neon);
+        p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_neon);
+        p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_neon);
+        p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_neon);
+#endif
+
+        p.pu[LUMA_8x4].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_8x4>;
+        p.pu[LUMA_8x8].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_8x8>;
+        p.pu[LUMA_8x16].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_8x16>;
+        p.pu[LUMA_8x32].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_8x32>;
+        p.pu[LUMA_12x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_12x16>;
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
+        p.pu[LUMA_16x4].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_16x4>;
+        p.pu[LUMA_16x8].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_16x8>;
+        p.pu[LUMA_16x12].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x12>;
+        p.pu[LUMA_16x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x16>;
+        p.pu[LUMA_16x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x32>;
+        p.pu[LUMA_16x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x64>;
+        p.pu[LUMA_32x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x16>;
+        p.pu[LUMA_32x24].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x24>;
+        p.pu[LUMA_32x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x32>;
+        p.pu[LUMA_32x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x64>;
+        p.pu[LUMA_48x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_48x64>;
​

x265_3.4.tar.gz/source/common/aarch64/asm.S Added

@@ -0,0 +1,69 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+.arch           armv8-a
+
+#ifdef PREFIX
+#define EXTERN_ASM _
+#else
+#define EXTERN_ASM
+#endif
+
+#ifdef __ELF__
+#define ELF
+#else
+#define ELF @
+#endif
+
+#define HAVE_AS_FUNC 1
+
+#if HAVE_AS_FUNC
+#define FUNC
+#else
+#define FUNC @
+#endif
+
+.macro function name, export=1 /* opens a function symbol; export=1 (the default) declares it global */
+    .macro endfunc /* defined anew by each use of function, so it closes the function just opened */
+ELF     .size   \name, . - \name /* symbol size = current location minus the function's start */
+FUNC    .endfunc
+        .purgem endfunc /* drop this definition so the next use of function can define its own */
+    .endm
+        .align  2
+.if \export == 1
+        .global EXTERN_ASM\name /* exported symbol: the EXTERN_ASM prefix is an underscore when PREFIX is defined, otherwise empty */
+ELF     .hidden EXTERN_ASM\name
+ELF     .type   EXTERN_ASM\name, %function
+FUNC    .func   EXTERN_ASM\name
+EXTERN_ASM\name:
+.else /* not exported: same bookkeeping without .global and without the prefix */
+ELF     .hidden \name
+ELF     .type   \name, %function
+FUNC    .func   \name
+\name:
+.endif
+.endm
+
+
+#define FENC_STRIDE 64
+#define FDEC_STRIDE 32

 
@@ -0,0 +1,69 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+.arch           armv8-a
+
+#ifdef PREFIX
+#define EXTERN_ASM _
+#else
+#define EXTERN_ASM
+#endif
+
+#ifdef __ELF__
+#define ELF
+#else
+#define ELF @
+#endif
+
+#define HAVE_AS_FUNC 1
+
+#if HAVE_AS_FUNC
+#define FUNC
+#else
+#define FUNC @
+#endif
+
+.macro function name, export=1 /* opens a function symbol; export=1 (the default) declares it global */
+    .macro endfunc /* defined anew by each use of function, so it closes the function just opened */
+ELF     .size   \name, . - \name /* symbol size = current location minus the function's start */
+FUNC    .endfunc
+        .purgem endfunc /* drop this definition so the next use of function can define its own */
+    .endm
+        .align  2
+.if \export == 1
+        .global EXTERN_ASM\name /* exported symbol: the EXTERN_ASM prefix is an underscore when PREFIX is defined, otherwise empty */
+ELF     .hidden EXTERN_ASM\name
+ELF     .type   EXTERN_ASM\name, %function
+FUNC    .func   EXTERN_ASM\name
+EXTERN_ASM\name:
+.else /* not exported: same bookkeeping without .global and without the prefix */
+ELF     .hidden \name
+ELF     .type   \name, %function
+FUNC    .func   \name
+\name:
+.endif
+.endm
+
+
+#define FENC_STRIDE 64
+#define FDEC_STRIDE 32
​

x265_3.4.tar.gz/source/common/aarch64/ipfilter8.S Added

@@ -0,0 +1,414 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+
+
+.macro qpel_filter_0_32b
+    movi            v24.8h, #64                 // the only coefficient: 64
+    uxtl            v19.8h, v5.8b               // zero-extend the 8 bytes of v5 to 16-bit lanes
+    smull           v17.4s, v19.4h, v24.4h      // v17 = 64 * lanes 0-3, as 32-bit values
+    smull2          v18.4s, v19.8h, v24.8h      // v18 = 64 * lanes 4-7, as 32-bit values
+.endm
+
+.macro qpel_filter_1_32b
+    movi            v16.8h, #58                 // overall result: v17:v18 = -v0 + 4*v4 - 10*v1 + 58*v5 + 17*v2 - 5*v6 + v3
+    uxtl            v19.8h, v5.8b
+    smull           v17.4s, v19.4h, v16.4h      // v17:v18 = 58 * v5
+    smull2          v18.4s, v19.8h, v16.8h
+
+    movi            v24.8h, #10
+    uxtl            v21.8h, v1.8b
+    smull           v19.4s, v21.4h, v24.4h      // v19:v20 = 10 * v1
+    smull2          v20.4s, v21.8h, v24.8h
+
+    movi            v16.8h, #17
+    uxtl            v23.8h, v2.8b
+    smull           v21.4s, v23.4h, v16.4h      // v21:v22 = 17 * v2
+    smull2          v22.4s, v23.8h, v16.8h
+
+    movi            v24.8h, #5
+    uxtl            v1.8h, v6.8b                // v1 is reused as scratch from here on; its input bytes were consumed above
+    smull           v23.4s, v1.4h, v24.4h       // v23:v16 = 5 * v6 (the high half lands in v16)
+    smull2          v16.4s, v1.8h, v24.8h
+
+    sub             v17.4s, v17.4s, v19.4s      // accumulate: - 10 * v1
+    sub             v18.4s, v18.4s, v20.4s
+
+    uxtl            v1.8h, v4.8b
+    sshll           v19.4s, v1.4h, #2           // v19:v20 = 4 * v4
+    sshll2          v20.4s, v1.8h, #2
+
+    add             v17.4s, v17.4s, v21.4s      // accumulate: + 17 * v2
+    add             v18.4s, v18.4s, v22.4s
+
+    uxtl            v1.8h, v0.8b
+    uxtl            v2.8h, v3.8b                // v2 is overwritten too; its input bytes were consumed above
+    ssubl           v21.4s, v2.4h, v1.4h        // v21:v22 = v3 - v0
+    ssubl2          v22.4s, v2.8h, v1.8h
+
+    add             v17.4s, v17.4s, v19.4s      // accumulate: + 4 * v4
+    add             v18.4s, v18.4s, v20.4s
+    sub             v21.4s, v21.4s, v23.4s      // v21:v22 = v3 - v0 - 5 * v6
+    sub             v22.4s, v22.4s, v16.4s
+    add             v17.4s, v17.4s, v21.4s      // final sum left in v17 (lanes 0-3) and v18 (lanes 4-7)
+    add             v18.4s, v18.4s, v22.4s
+.endm
+
+.macro qpel_filter_2_32b
+    movi            v16.4s, #11                 // overall result: v17:v18 = 40*(v5 + v2) + 4*(v4 + v3) - 11*(v1 + v6) - (v0 + v7)
+    uxtl            v19.8h, v5.8b
+    uxtl            v20.8h, v2.8b
+    saddl           v17.4s, v19.4h, v20.4h      // v17:v18 = v5 + v2
+    saddl2          v18.4s, v19.8h, v20.8h
+
+    uxtl            v21.8h, v1.8b
+    uxtl            v22.8h, v6.8b
+    saddl           v19.4s, v21.4h, v22.4h      // v19:v20 = v1 + v6
+    saddl2          v20.4s, v21.8h, v22.8h
+
+    mul             v19.4s, v19.4s, v16.4s      // v19:v20 = 11 * (v1 + v6)
+    mul             v20.4s, v20.4s, v16.4s
+
+    movi            v16.4s, #40
+    mul             v17.4s, v17.4s, v16.4s      // v17:v18 = 40 * (v5 + v2)
+    mul             v18.4s, v18.4s, v16.4s
+
+    uxtl            v21.8h, v4.8b
+    uxtl            v22.8h, v3.8b
+    saddl           v23.4s, v21.4h, v22.4h      // v23:v16 = v4 + v3 (the high half lands in v16)
+    saddl2          v16.4s, v21.8h, v22.8h
+
+    uxtl            v1.8h, v0.8b                // v1 and v2 become scratch; their input bytes were consumed above
+    uxtl            v2.8h, v7.8b
+    saddl           v21.4s, v1.4h, v2.4h        // v21:v22 = v0 + v7
+    saddl2          v22.4s, v1.8h, v2.8h
+
+    shl             v23.4s, v23.4s, #2          // v23:v16 = 4 * (v4 + v3)
+    shl             v16.4s, v16.4s, #2
+
+    add             v19.4s, v19.4s, v21.4s      // v19:v20 = all terms that are subtracted
+    add             v20.4s, v20.4s, v22.4s
+    add             v17.4s, v17.4s, v23.4s      // v17:v18 = all terms that are added
+    add             v18.4s, v18.4s, v16.4s
+    sub             v17.4s, v17.4s, v19.4s      // final sum left in v17 (lanes 0-3) and v18 (lanes 4-7)
+    sub             v18.4s, v18.4s, v20.4s
+.endm
+
+.macro qpel_filter_3_32b
+    movi            v16.8h, #17                 // overall result: v17:v18 = v4 - 5*v1 + 17*v5 + 58*v2 - 10*v6 + 4*v3 - v7
+    movi            v24.8h, #5
+
+    uxtl            v19.8h, v5.8b
+    smull           v17.4s, v19.4h, v16.4h      // v17:v18 = 17 * v5
+    smull2          v18.4s, v19.8h, v16.8h
+
+    uxtl            v21.8h, v1.8b
+    smull           v19.4s, v21.4h, v24.4h      // v19:v20 = 5 * v1
+    smull2          v20.4s, v21.8h, v24.8h
+
+    movi            v16.8h, #58
+    uxtl            v23.8h, v2.8b
+    smull           v21.4s, v23.4h, v16.4h      // v21:v22 = 58 * v2
+    smull2          v22.4s, v23.8h, v16.8h
+
+    movi            v24.8h, #10
+    uxtl            v1.8h, v6.8b                // v1 is reused as scratch from here on; its input bytes were consumed above
+    smull           v23.4s, v1.4h, v24.4h       // v23:v16 = 10 * v6 (the high half lands in v16)
+    smull2          v16.4s, v1.8h, v24.8h
+
+    sub             v17.4s, v17.4s, v19.4s      // accumulate: - 5 * v1
+    sub             v18.4s, v18.4s, v20.4s
+
+    uxtl            v1.8h, v3.8b
+    sshll           v19.4s, v1.4h, #2           // v19:v20 = 4 * v3
+    sshll2          v20.4s, v1.8h, #2
+
+    add             v17.4s, v17.4s, v21.4s      // accumulate: + 58 * v2
+    add             v18.4s, v18.4s, v22.4s
+
+    uxtl            v1.8h, v4.8b
+    uxtl            v2.8h, v7.8b                // v2 is overwritten too; its input bytes were consumed above
+    ssubl           v21.4s, v1.4h, v2.4h        // v21:v22 = v4 - v7
+    ssubl2          v22.4s, v1.8h, v2.8h
+
+    add             v17.4s, v17.4s, v19.4s      // accumulate: + 4 * v3
+    add             v18.4s, v18.4s, v20.4s
+    sub             v21.4s, v21.4s, v23.4s      // v21:v22 = v4 - v7 - 10 * v6
+    sub             v22.4s, v22.4s, v16.4s
+    add             v17.4s, v17.4s, v21.4s      // final sum left in v17 (lanes 0-3) and v18 (lanes 4-7)
+    add             v18.4s, v18.4s, v22.4s
+.endm
+
+
+
+
+.macro vextin8
+    ld1             {v3.16b}, [x11], #16        // load 16 bytes from [x11], then advance x11 by 16
+    mov             v7.d[0], v3.d[1]            // low half of v7 = bytes 8-15 of the load
+    ext             v0.8b, v3.8b, v7.8b, #1     // v0 = bytes 1-8
+    ext             v4.8b, v3.8b, v7.8b, #2     // v4 = bytes 2-9
+    ext             v1.8b, v3.8b, v7.8b, #3     // v1 = bytes 3-10
+    ext             v5.8b, v3.8b, v7.8b, #4     // v5 = bytes 4-11
+    ext             v2.8b, v3.8b, v7.8b, #5     // v2 = bytes 5-12
+    ext             v6.8b, v3.8b, v7.8b, #6     // v6 = bytes 6-13
+    ext             v3.8b, v3.8b, v7.8b, #7     // v3 = bytes 7-14; done last because v3 is the source of every window above
+.endm
+
+
+
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+.macro HPS_FILTER a b filterhps
+    mov             w12, #8192
+    mov             w6, w10
+    sub             x3, x3, #\a
+    lsl             x3, x3, #1
+    mov             w9, #\a
+    cmp             w9, #4
+    b.eq            14f
+    cmp             w9, #12
+    b.eq            15f
+    b               7f
+14:

 
@@ -0,0 +1,414 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+
+
+// qpel_filter_0_32b: v17:v18 = 64 * v5, where v5 holds 8 unsigned bytes.
+// The eight 32-bit results are split as lanes 0-3 in v17 and lanes 4-7 in v18.
+// Clobbers v19 and v24.
+.macro qpel_filter_0_32b
+    movi            v24.8h, #64
+    uxtl            v19.8h, v5.8b
+    smull           v17.4s, v19.4h, v24.4h
+    smull2          v18.4s, v19.8h, v24.8h
+.endm
+
+// qpel_filter_1_32b: per byte lane of the 8-byte inputs v0-v6, compute
+//   -v0 + 4*v4 - 10*v1 + 58*v5 + 17*v2 - 5*v6 + v3
+// as 32-bit values in v17 (lanes 0-3) and v18 (lanes 4-7). v7 is not read.
+// Destroys the inputs v1 and v2 and clobbers v16 and v19-v24.
+.macro qpel_filter_1_32b
+    // v17:v18 = 58 * v5
+    movi            v16.8h, #58
+    uxtl            v19.8h, v5.8b
+    smull           v17.4s, v19.4h, v16.4h
+    smull2          v18.4s, v19.8h, v16.8h
+
+    // v19:v20 = 10 * v1
+    movi            v24.8h, #10
+    uxtl            v21.8h, v1.8b
+    smull           v19.4s, v21.4h, v24.4h
+    smull2          v20.4s, v21.8h, v24.8h
+
+    // v21:v22 = 17 * v2
+    movi            v16.8h, #17
+    uxtl            v23.8h, v2.8b
+    smull           v21.4s, v23.4h, v16.4h
+    smull2          v22.4s, v23.8h, v16.8h
+
+    // v23:v16 = 5 * v6 (v1 is reused as scratch; v16 no longer holds a constant)
+    movi            v24.8h, #5
+    uxtl            v1.8h, v6.8b
+    smull           v23.4s, v1.4h, v24.4h
+    smull2          v16.4s, v1.8h, v24.8h
+
+    sub             v17.4s, v17.4s, v19.4s
+    sub             v18.4s, v18.4s, v20.4s
+
+    // v19:v20 = 4 * v4 (widening shift left by 2)
+    uxtl            v1.8h, v4.8b
+    sshll           v19.4s, v1.4h, #2
+    sshll2          v20.4s, v1.8h, #2
+
+    add             v17.4s, v17.4s, v21.4s
+    add             v18.4s, v18.4s, v22.4s
+
+    // v21:v22 = v3 - v0 (v1 and v2 are reused as scratch here)
+    uxtl            v1.8h, v0.8b
+    uxtl            v2.8h, v3.8b
+    ssubl           v21.4s, v2.4h, v1.4h
+    ssubl2          v22.4s, v2.8h, v1.8h
+
+    add             v17.4s, v17.4s, v19.4s
+    add             v18.4s, v18.4s, v20.4s
+    sub             v21.4s, v21.4s, v23.4s
+    sub             v22.4s, v22.4s, v16.4s
+    add             v17.4s, v17.4s, v21.4s
+    add             v18.4s, v18.4s, v22.4s
+.endm
+
+// qpel_filter_2_32b: per byte lane of the 8-byte inputs v0-v7, compute
+//   40*(v5 + v2) + 4*(v4 + v3) - 11*(v1 + v6) - (v0 + v7)
+// as 32-bit values in v17 (lanes 0-3) and v18 (lanes 4-7).
+// Destroys the inputs v1 and v2 and clobbers v16 and v19-v23.
+.macro qpel_filter_2_32b
+    movi            v16.4s, #11
+    // v17:v18 = v5 + v2
+    uxtl            v19.8h, v5.8b
+    uxtl            v20.8h, v2.8b
+    saddl           v17.4s, v19.4h, v20.4h
+    saddl2          v18.4s, v19.8h, v20.8h
+
+    // v19:v20 = v1 + v6
+    uxtl            v21.8h, v1.8b
+    uxtl            v22.8h, v6.8b
+    saddl           v19.4s, v21.4h, v22.4h
+    saddl2          v20.4s, v21.8h, v22.8h
+
+    // v19:v20 *= 11
+    mul             v19.4s, v19.4s, v16.4s
+    mul             v20.4s, v20.4s, v16.4s
+
+    // v17:v18 *= 40
+    movi            v16.4s, #40
+    mul             v17.4s, v17.4s, v16.4s
+    mul             v18.4s, v18.4s, v16.4s
+
+    // v23:v16 = v4 + v3 (v16 is reused as the high half from here on)
+    uxtl            v21.8h, v4.8b
+    uxtl            v22.8h, v3.8b
+    saddl           v23.4s, v21.4h, v22.4h
+    saddl2          v16.4s, v21.8h, v22.8h
+
+    // v21:v22 = v0 + v7 (v1 and v2 are reused as scratch here)
+    uxtl            v1.8h, v0.8b
+    uxtl            v2.8h, v7.8b
+    saddl           v21.4s, v1.4h, v2.4h
+    saddl2          v22.4s, v1.8h, v2.8h
+
+    // v23:v16 *= 4
+    shl             v23.4s, v23.4s, #2
+    shl             v16.4s, v16.4s, #2
+
+    // Gather the negative terms in v19:v20, the positive ones in v17:v18.
+    add             v19.4s, v19.4s, v21.4s
+    add             v20.4s, v20.4s, v22.4s
+    add             v17.4s, v17.4s, v23.4s
+    add             v18.4s, v18.4s, v16.4s
+    sub             v17.4s, v17.4s, v19.4s
+    sub             v18.4s, v18.4s, v20.4s
+.endm
+
+// qpel_filter_3_32b: per byte lane of the 8-byte inputs v1-v7, compute
+//   v4 - 5*v1 + 17*v5 + 58*v2 - 10*v6 + 4*v3 - v7
+// as 32-bit values in v17 (lanes 0-3) and v18 (lanes 4-7). v0 is not read.
+// Destroys the inputs v1 and v2 and clobbers v16 and v19-v24.
+.macro qpel_filter_3_32b
+    movi            v16.8h, #17
+    movi            v24.8h, #5
+
+    // v17:v18 = 17 * v5
+    uxtl            v19.8h, v5.8b
+    smull           v17.4s, v19.4h, v16.4h
+    smull2          v18.4s, v19.8h, v16.8h
+
+    // v19:v20 = 5 * v1
+    uxtl            v21.8h, v1.8b
+    smull           v19.4s, v21.4h, v24.4h
+    smull2          v20.4s, v21.8h, v24.8h
+
+    // v21:v22 = 58 * v2
+    movi            v16.8h, #58
+    uxtl            v23.8h, v2.8b
+    smull           v21.4s, v23.4h, v16.4h
+    smull2          v22.4s, v23.8h, v16.8h
+
+    // v23:v16 = 10 * v6 (v1 is reused as scratch; v16 no longer holds a constant)
+    movi            v24.8h, #10
+    uxtl            v1.8h, v6.8b
+    smull           v23.4s, v1.4h, v24.4h
+    smull2          v16.4s, v1.8h, v24.8h
+
+    sub             v17.4s, v17.4s, v19.4s
+    sub             v18.4s, v18.4s, v20.4s
+
+    // v19:v20 = 4 * v3 (widening shift left by 2)
+    uxtl            v1.8h, v3.8b
+    sshll           v19.4s, v1.4h, #2
+    sshll2          v20.4s, v1.8h, #2
+
+    add             v17.4s, v17.4s, v21.4s
+    add             v18.4s, v18.4s, v22.4s
+
+    // v21:v22 = v4 - v7 (v1 and v2 are reused as scratch here)
+    uxtl            v1.8h, v4.8b
+    uxtl            v2.8h, v7.8b
+    ssubl           v21.4s, v1.4h, v2.4h
+    ssubl2          v22.4s, v1.8h, v2.8h
+
+    add             v17.4s, v17.4s, v19.4s
+    add             v18.4s, v18.4s, v20.4s
+    sub             v21.4s, v21.4s, v23.4s
+    sub             v22.4s, v22.4s, v16.4s
+    add             v17.4s, v17.4s, v21.4s
+    add             v18.4s, v18.4s, v22.4s
+.endm
+
+
+
+
+// vextin8: load 16 source bytes from [x11] (post-incrementing x11 by 16) and
+// build sliding 8-byte windows from them. With b[0..15] the loaded bytes, the
+// low 64 bits of the registers end up as:
+//   v0 = b[1..8]   v4 = b[2..9]   v1 = b[3..10]  v5 = b[4..11]
+//   v2 = b[5..12]  v6 = b[6..13]  v3 = b[7..14]  v7 = b[8..15]
+.macro vextin8
+    ld1             {v3.16b}, [x11], #16
+    // Copy the upper 8 loaded bytes into v7 so ext can span both halves.
+    mov             v7.d[0], v3.d[1]
+    ext             v0.8b, v3.8b, v7.8b, #1
+    ext             v4.8b, v3.8b, v7.8b, #2
+    ext             v1.8b, v3.8b, v7.8b, #3
+    ext             v5.8b, v3.8b, v7.8b, #4
+    ext             v2.8b, v3.8b, v7.8b, #5
+    ext             v6.8b, v3.8b, v7.8b, #6
+    // v3 is rewritten last because it is the source of every ext above.
+    ext             v3.8b, v3.8b, v7.8b, #7
+.endm
+
+
+
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+.macro HPS_FILTER a b filterhps
+    mov             w12, #8192
+    mov             w6, w10
+    sub             x3, x3, #\a
+    lsl             x3, x3, #1
+    mov             w9, #\a
+    cmp             w9, #4
+    b.eq            14f
+    cmp             w9, #12
+    b.eq            15f
+    b               7f
+14:
​

x265_3.4.tar.gz/source/common/aarch64/ipfilter8.h Added

@@ -0,0 +1,55 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_IPFILTER8_AARCH64_H
+#define X265_IPFILTER8_AARCH64_H
+
+
+// Prototypes for the x265_interp_8tap_horiz_ps_<W>x<H>_neon routines, one per
+// block size. All share one signature: a pixel source with srcStride, an
+// int16_t destination with dstStride, plus coeffIdx and isRowExt.
+// NOTE(review): the definitions are not in this header; presumably they live
+// in the matching aarch64 assembly file - confirm.
+void x265_interp_8tap_horiz_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+
+
+#endif // ifndef X265_IPFILTER8_AARCH64_H

 
@@ -0,0 +1,55 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_IPFILTER8_AARCH64_H
+#define X265_IPFILTER8_AARCH64_H
+
+
+// Prototypes for the x265_interp_8tap_horiz_ps_<W>x<H>_neon routines, one per
+// block size. All share one signature: a pixel source with srcStride, an
+// int16_t destination with dstStride, plus coeffIdx and isRowExt.
+// NOTE(review): the definitions are not in this header; presumably they live
+// in the matching aarch64 assembly file - confirm.
+void x265_interp_8tap_horiz_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+
+
+#endif // ifndef X265_IPFILTER8_AARCH64_H
​

x265_3.4.tar.gz/source/common/aarch64/mc-a.S Added

@@ -0,0 +1,63 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+// pixel_avg_pp_4xN_neon h: emit the function x265_pixel_avg_pp_4x<h>_neon.
+// The row body is unrolled h times. Each row loads 4 bytes from [x2] and from
+// [x4], combines them with urhadd (unsigned rounding halving add) and stores
+// 4 result bytes to [x0]. x2, x4 and x0 post-increment by x3, x5 and x1.
+// Only lane s[0] is loaded and stored; the other lanes urhadd touches are unused.
+// NOTE(review): function/endfunc are presumably macros from asm.S - confirm.
+.macro pixel_avg_pp_4xN_neon h
+function x265_pixel_avg_pp_4x\h\()_neon
+.rept \h
+    ld1             {v0.s}[0], [x2], x3
+    ld1             {v1.s}[0], [x4], x5
+    urhadd          v2.8b, v0.8b, v1.8b
+    st1             {v2.s}[0], [x0], x1
+.endr
+    ret
+endfunc
+.endm
+
+// Instantiate the 4-wide variants for heights 4, 8 and 16.
+pixel_avg_pp_4xN_neon 4
+pixel_avg_pp_4xN_neon 8
+pixel_avg_pp_4xN_neon 16
+
+// pixel_avg_pp_8xN_neon h: emit the function x265_pixel_avg_pp_8x<h>_neon.
+// The row body is unrolled h times. Each row loads 8 bytes from [x2] and from
+// [x4], combines them with urhadd (unsigned rounding halving add) and stores
+// 8 result bytes to [x0]. x2, x4 and x0 post-increment by x3, x5 and x1.
+// NOTE(review): function/endfunc are presumably macros from asm.S - confirm.
+.macro pixel_avg_pp_8xN_neon h
+function x265_pixel_avg_pp_8x\h\()_neon
+.rept \h
+    ld1             {v0.8b}, [x2], x3
+    ld1             {v1.8b}, [x4], x5
+    urhadd          v2.8b, v0.8b, v1.8b
+    st1             {v2.8b}, [x0], x1
+.endr
+    ret
+endfunc
+.endm
+
+// Instantiate the 8-wide variants for heights 4, 8, 16 and 32.
+pixel_avg_pp_8xN_neon 4
+pixel_avg_pp_8xN_neon 8
+pixel_avg_pp_8xN_neon 16
+pixel_avg_pp_8xN_neon 32

 
@@ -0,0 +1,63 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+// pixel_avg_pp_4xN_neon h: emit the function x265_pixel_avg_pp_4x<h>_neon.
+// The row body is unrolled h times. Each row loads 4 bytes from [x2] and from
+// [x4], combines them with urhadd (unsigned rounding halving add) and stores
+// 4 result bytes to [x0]. x2, x4 and x0 post-increment by x3, x5 and x1.
+// Only lane s[0] is loaded and stored; the other lanes urhadd touches are unused.
+// NOTE(review): function/endfunc are presumably macros from asm.S - confirm.
+.macro pixel_avg_pp_4xN_neon h
+function x265_pixel_avg_pp_4x\h\()_neon
+.rept \h
+    ld1             {v0.s}[0], [x2], x3
+    ld1             {v1.s}[0], [x4], x5
+    urhadd          v2.8b, v0.8b, v1.8b
+    st1             {v2.s}[0], [x0], x1
+.endr
+    ret
+endfunc
+.endm
+
+// Instantiate the 4-wide variants for heights 4, 8 and 16.
+pixel_avg_pp_4xN_neon 4
+pixel_avg_pp_4xN_neon 8
+pixel_avg_pp_4xN_neon 16
+
+// pixel_avg_pp_8xN_neon h: emit the function x265_pixel_avg_pp_8x<h>_neon.
+// The row body is unrolled h times. Each row loads 8 bytes from [x2] and from
+// [x4], combines them with urhadd (unsigned rounding halving add) and stores
+// 8 result bytes to [x0]. x2, x4 and x0 post-increment by x3, x5 and x1.
+// NOTE(review): function/endfunc are presumably macros from asm.S - confirm.
+.macro pixel_avg_pp_8xN_neon h
+function x265_pixel_avg_pp_8x\h\()_neon
+.rept \h
+    ld1             {v0.8b}, [x2], x3
+    ld1             {v1.8b}, [x4], x5
+    urhadd          v2.8b, v0.8b, v1.8b
+    st1             {v2.8b}, [x0], x1
+.endr
+    ret
+endfunc
+.endm
+
+// Instantiate the 8-wide variants for heights 4, 8, 16 and 32.
+pixel_avg_pp_8xN_neon 4
+pixel_avg_pp_8xN_neon 8
+pixel_avg_pp_8xN_neon 16
+pixel_avg_pp_8xN_neon 32
​

x265_3.4.tar.gz/source/common/aarch64/pixel-util.S Added

@@ -0,0 +1,419 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su@huawei.com>
+ *          Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+// x265_satd_4x8_8x4_end_neon: final reduction stage used by the satd macros.
+// Input:  v4-v7, each holding eight 16-bit lanes.
+// Output: s0 = sum of the eight 16-bit lanes of the reduced vector (uaddlv).
+// Clobbers v0-v7 and v16-v19.
+.macro x265_satd_4x8_8x4_end_neon
+    // Butterfly across registers: sums and differences of (v4,v6) and (v5,v7).
+    add             v0.8h, v4.8h, v6.8h
+    add             v1.8h, v5.8h, v7.8h
+    sub             v2.8h, v4.8h, v6.8h
+    sub             v3.8h, v5.8h, v7.8h
+
+    // Interleave 16-bit lanes with trn1/trn2, then butterfly the results.
+    trn1            v16.8h, v0.8h, v1.8h
+    trn2            v17.8h, v0.8h, v1.8h
+    add             v4.8h, v16.8h, v17.8h
+    trn1            v18.8h, v2.8h, v3.8h
+    trn2            v19.8h, v2.8h, v3.8h
+    sub             v5.8h, v16.8h, v17.8h
+    add             v6.8h, v18.8h, v19.8h
+    sub             v7.8h, v18.8h, v19.8h
+    // Interleave 32-bit lanes, take absolute values and keep the lane-wise
+    // maximum of each trn1/trn2 pair.
+    // NOTE(review): presumably relies on max(|a+b|, |a-b|) = |a| + |b| to fold
+    // the last butterfly stage into umax - confirm.
+    trn1            v0.4s, v4.4s, v6.4s
+    trn2            v2.4s, v4.4s, v6.4s
+    abs             v0.8h, v0.8h
+    trn1            v1.4s, v5.4s, v7.4s
+    trn2            v3.4s, v5.4s, v7.4s
+    abs             v2.8h, v2.8h
+    abs             v1.8h, v1.8h
+    abs             v3.8h, v3.8h
+    umax            v0.8h, v0.8h, v2.8h
+    umax            v1.8h, v1.8h, v3.8h
+    add             v0.8h, v0.8h, v1.8h
+    // Horizontal widening sum of all eight lanes into s0.
+    uaddlv          s0, v0.8h
+.endm
+
+// pixel_satd_4x8_neon: process one 4x8 block and leave the result in s0.
+// Reads 8 rows of 4 bytes from [x0] (stride x1) and from [x2] (stride x3);
+// both pointers end up advanced by 8 rows. Rows 0-3 are loaded with ld1r and
+// rows 4-7 then overwrite lane 1, so each 8-byte register holds rows r and r+4
+// of one source: v0/v2/v4/v6 from [x0], v1/v3/v5/v7 from [x2].
+// Clobbers v0-v7 and v16-v19 (the latter via x265_satd_4x8_8x4_end_neon).
+.macro pixel_satd_4x8_neon
+    ld1r             {v1.2s}, [x2], x3
+    ld1r            {v0.2s}, [x0], x1
+    ld1r            {v3.2s}, [x2], x3
+    ld1r            {v2.2s}, [x0], x1
+    ld1r            {v5.2s}, [x2], x3
+    ld1r            {v4.2s}, [x0], x1
+    ld1r            {v7.2s}, [x2], x3
+    ld1r            {v6.2s}, [x0], x1
+
+    // Rows 4-7 go into lane 1; the widening subtractions and first butterfly
+    // are interleaved with the remaining loads, so statement order matters.
+    ld1             {v1.s}[1], [x2], x3
+    ld1             {v0.s}[1], [x0], x1
+    usubl           v0.8h, v0.8b, v1.8b
+    ld1             {v3.s}[1], [x2], x3
+    ld1             {v2.s}[1], [x0], x1
+    usubl           v1.8h, v2.8b, v3.8b
+    ld1             {v5.s}[1], [x2], x3
+    ld1             {v4.s}[1], [x0], x1
+    usubl           v2.8h, v4.8b, v5.8b
+    ld1             {v7.s}[1], [x2], x3
+    add             v4.8h, v0.8h, v1.8h
+    sub             v5.8h, v0.8h, v1.8h
+    ld1             {v6.s}[1], [x0], x1
+    usubl           v3.8h, v6.8b, v7.8b
+    add         v6.8h, v2.8h, v3.8h
+    sub         v7.8h, v2.8h, v3.8h
+    x265_satd_4x8_8x4_end_neon
+.endm
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// 4x8 case: a single pixel_satd_4x8_neon pass; the sum left in s0 is returned in w0.
+function x265_pixel_satd_4x8_neon
+    pixel_satd_4x8_neon
+    mov               w0, v0.s[0]
+    ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// 4x16 case: two consecutive 4x8 passes. The macro leaves x0/x2 advanced by
+// 8 rows, so the second pass covers rows 8-15. w4 accumulates; result in w0.
+function x265_pixel_satd_4x16_neon
+    // w4 = 0
+    eor             w4, w4, w4
+    pixel_satd_4x8_neon
+    mov               w5, v0.s[0]
+    add             w4, w4, w5
+    pixel_satd_4x8_neon
+    mov               w5, v0.s[0]
+    add             w0, w5, w4
+    ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// 4x32 case: four consecutive 4x8 passes (the macro advances x0/x2 by 8 rows
+// each time). w4 accumulates the partial sums; result returned in w0.
+function x265_pixel_satd_4x32_neon
+    // w4 = 0
+    eor             w4, w4, w4
+.rept 4
+    pixel_satd_4x8_neon
+    mov             w5, v0.s[0]
+    add             w4, w4, w5
+.endr
+    mov             w0, w4
+    ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// 12x16 case: three 4-byte-wide columns at byte offsets 0, 4 and 8, each
+// handled as two consecutive 4x8 passes. x4/x5 keep the original x0/x2 so the
+// column starts can be recomputed; w7 accumulates; result returned in w0.
+function x265_pixel_satd_12x16_neon
+    mov             x4, x0
+    mov             x5, x2
+    // w7 = 0
+    eor             w7, w7, w7
+    // Column at offset 0.
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+
+    // Column at offset 4.
+    add             x0, x4, #4
+    add             x2, x5, #4
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+
+    // Column at offset 8.
+    add             x0, x4, #8
+    add             x2, x5, #8
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w0, w7, w6
+    ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// 12x32 case: three 4-byte-wide columns at byte offsets 0, 4 and 8, each
+// handled as four consecutive 4x8 passes. x4/x5 keep the original x0/x2 so the
+// column starts can be recomputed; w7 accumulates; result returned in w0.
+function x265_pixel_satd_12x32_neon
+    mov             x4, x0
+    mov             x5, x2
+    // w7 = 0
+    eor             w7, w7, w7
+    // Column at offset 0.
+.rept 4
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+.endr
+
+    // Column at offset 4.
+    add             x0, x4, #4
+    add             x2, x5, #4
+.rept 4
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+.endr
+
+    // Column at offset 8.
+    add             x0, x4, #8
+    add             x2, x5, #8
+.rept 4
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+.endr
+
+    mov             w0, w7
+    ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function x265_pixel_satd_8x8_neon
+    eor             w4, w4, w4
+    mov             x6, x0
+    mov             x7, x2
+    pixel_satd_4x8_neon
+    mov             w5, v0.s[0]
+    add             w4, w4, w5
+    add             x0, x6, #4
+    add             x2, x7, #4

 
@@ -0,0 +1,419 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su@huawei.com>
+ *          Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+// x265_satd_4x8_8x4_end_neon: final reduction stage used by the satd macros.
+// Input:  v4-v7, each holding eight 16-bit lanes.
+// Output: s0 = sum of the eight 16-bit lanes of the reduced vector (uaddlv).
+// Clobbers v0-v7 and v16-v19.
+.macro x265_satd_4x8_8x4_end_neon
+    // Butterfly across registers: sums and differences of (v4,v6) and (v5,v7).
+    add             v0.8h, v4.8h, v6.8h
+    add             v1.8h, v5.8h, v7.8h
+    sub             v2.8h, v4.8h, v6.8h
+    sub             v3.8h, v5.8h, v7.8h
+
+    // Interleave 16-bit lanes with trn1/trn2, then butterfly the results.
+    trn1            v16.8h, v0.8h, v1.8h
+    trn2            v17.8h, v0.8h, v1.8h
+    add             v4.8h, v16.8h, v17.8h
+    trn1            v18.8h, v2.8h, v3.8h
+    trn2            v19.8h, v2.8h, v3.8h
+    sub             v5.8h, v16.8h, v17.8h
+    add             v6.8h, v18.8h, v19.8h
+    sub             v7.8h, v18.8h, v19.8h
+    // Interleave 32-bit lanes, take absolute values and keep the lane-wise
+    // maximum of each trn1/trn2 pair.
+    // NOTE(review): presumably relies on max(|a+b|, |a-b|) = |a| + |b| to fold
+    // the last butterfly stage into umax - confirm.
+    trn1            v0.4s, v4.4s, v6.4s
+    trn2            v2.4s, v4.4s, v6.4s
+    abs             v0.8h, v0.8h
+    trn1            v1.4s, v5.4s, v7.4s
+    trn2            v3.4s, v5.4s, v7.4s
+    abs             v2.8h, v2.8h
+    abs             v1.8h, v1.8h
+    abs             v3.8h, v3.8h
+    umax            v0.8h, v0.8h, v2.8h
+    umax            v1.8h, v1.8h, v3.8h
+    add             v0.8h, v0.8h, v1.8h
+    // Horizontal widening sum of all eight lanes into s0.
+    uaddlv          s0, v0.8h
+.endm
+
+// pixel_satd_4x8_neon: process one 4x8 block and leave the result in s0.
+// Reads 8 rows of 4 bytes from [x0] (stride x1) and from [x2] (stride x3);
+// both pointers end up advanced by 8 rows. Rows 0-3 are loaded with ld1r and
+// rows 4-7 then overwrite lane 1, so each 8-byte register holds rows r and r+4
+// of one source: v0/v2/v4/v6 from [x0], v1/v3/v5/v7 from [x2].
+// Clobbers v0-v7 and v16-v19 (the latter via x265_satd_4x8_8x4_end_neon).
+.macro pixel_satd_4x8_neon
+    ld1r             {v1.2s}, [x2], x3
+    ld1r            {v0.2s}, [x0], x1
+    ld1r            {v3.2s}, [x2], x3
+    ld1r            {v2.2s}, [x0], x1
+    ld1r            {v5.2s}, [x2], x3
+    ld1r            {v4.2s}, [x0], x1
+    ld1r            {v7.2s}, [x2], x3
+    ld1r            {v6.2s}, [x0], x1
+
+    // Rows 4-7 go into lane 1; the widening subtractions and first butterfly
+    // are interleaved with the remaining loads, so statement order matters.
+    ld1             {v1.s}[1], [x2], x3
+    ld1             {v0.s}[1], [x0], x1
+    usubl           v0.8h, v0.8b, v1.8b
+    ld1             {v3.s}[1], [x2], x3
+    ld1             {v2.s}[1], [x0], x1
+    usubl           v1.8h, v2.8b, v3.8b
+    ld1             {v5.s}[1], [x2], x3
+    ld1             {v4.s}[1], [x0], x1
+    usubl           v2.8h, v4.8b, v5.8b
+    ld1             {v7.s}[1], [x2], x3
+    add             v4.8h, v0.8h, v1.8h
+    sub             v5.8h, v0.8h, v1.8h
+    ld1             {v6.s}[1], [x0], x1
+    usubl           v3.8h, v6.8b, v7.8b
+    add         v6.8h, v2.8h, v3.8h
+    sub         v7.8h, v2.8h, v3.8h
+    x265_satd_4x8_8x4_end_neon
+.endm
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// 4x8 case: a single pixel_satd_4x8_neon pass; the sum left in s0 is returned in w0.
+function x265_pixel_satd_4x8_neon
+    pixel_satd_4x8_neon
+    mov               w0, v0.s[0]
+    ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// 4x16 case: two consecutive 4x8 passes. The macro leaves x0/x2 advanced by
+// 8 rows, so the second pass covers rows 8-15. w4 accumulates; result in w0.
+function x265_pixel_satd_4x16_neon
+    // w4 = 0
+    eor             w4, w4, w4
+    pixel_satd_4x8_neon
+    mov               w5, v0.s[0]
+    add             w4, w4, w5
+    pixel_satd_4x8_neon
+    mov               w5, v0.s[0]
+    add             w0, w5, w4
+    ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// 4x32 case: four consecutive 4x8 passes (the macro advances x0/x2 by 8 rows
+// each time). w4 accumulates the partial sums; result returned in w0.
+function x265_pixel_satd_4x32_neon
+    // w4 = 0
+    eor             w4, w4, w4
+.rept 4
+    pixel_satd_4x8_neon
+    mov             w5, v0.s[0]
+    add             w4, w4, w5
+.endr
+    mov             w0, w4
+    ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// 12x16 case: three 4-byte-wide columns at byte offsets 0, 4 and 8, each
+// handled as two consecutive 4x8 passes. x4/x5 keep the original x0/x2 so the
+// column starts can be recomputed; w7 accumulates; result returned in w0.
+function x265_pixel_satd_12x16_neon
+    mov             x4, x0
+    mov             x5, x2
+    // w7 = 0
+    eor             w7, w7, w7
+    // Column at offset 0.
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+
+    // Column at offset 4.
+    add             x0, x4, #4
+    add             x2, x5, #4
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+
+    // Column at offset 8.
+    add             x0, x4, #8
+    add             x2, x5, #8
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w0, w7, w6
+    ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// 12x32 case: three 4-byte-wide columns at byte offsets 0, 4 and 8, each
+// handled as four consecutive 4x8 passes. x4/x5 keep the original x0/x2 so the
+// column starts can be recomputed; w7 accumulates; result returned in w0.
+function x265_pixel_satd_12x32_neon
+    mov             x4, x0
+    mov             x5, x2
+    // w7 = 0
+    eor             w7, w7, w7
+    // Column at offset 0.
+.rept 4
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+.endr
+
+    // Column at offset 4.
+    add             x0, x4, #4
+    add             x2, x5, #4
+.rept 4
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+.endr
+
+    // Column at offset 8.
+    add             x0, x4, #8
+    add             x2, x5, #8
+.rept 4
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+.endr
+
+    mov             w0, w7
+    ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function x265_pixel_satd_8x8_neon
+    eor             w4, w4, w4
+    mov             x6, x0
+    mov             x7, x2
+    pixel_satd_4x8_neon
+    mov             w5, v0.s[0]
+    add             w4, w4, w5
+    add             x0, x6, #4
+    add             x2, x7, #4
​

x265_3.4.tar.gz/source/common/aarch64/pixel-util.h Added

@@ -0,0 +1,40 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su@huawei.com>
+ *          Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_PIXEL_UTIL_AARCH64_H
+#define X265_PIXEL_UTIL_AARCH64_H
+
+/* Prototypes for the AArch64 NEON pixel-utility routines.
+ *
+ * Every SATD entry point has the same signature and differs only in the
+ * block size embedded in its symbol name, so the prototypes are generated
+ * from one local helper macro. The helper is removed again right after use
+ * and declares exactly x265_pixel_satd_<size>_neon for each size listed. */
+#define X265_DECL_SATD_NEON(size) \
+    int x265_pixel_satd_ ## size ## _neon(const pixel* pix1, intptr_t stride_pix1, \
+                                          const pixel* pix2, intptr_t stride_pix2)
+
+X265_DECL_SATD_NEON(4x4);
+X265_DECL_SATD_NEON(4x8);
+X265_DECL_SATD_NEON(4x16);
+X265_DECL_SATD_NEON(4x32);
+X265_DECL_SATD_NEON(8x4);
+X265_DECL_SATD_NEON(8x8);
+X265_DECL_SATD_NEON(12x16);
+X265_DECL_SATD_NEON(12x32);
+
+#undef X265_DECL_SATD_NEON
+
+uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU,
+                         int16_t* qCoef, int qBits, int add, int numCoeff);
+
+/* NOTE(review): this is the only prototype in the header that goes through
+ * PFX(); the others spell out the x265_ prefix literally. Confirm which
+ * convention the matching assembly symbols follow. */
+int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride,
+                          const pixel* recon, intptr_t rstride);
+
+#endif // ifndef X265_PIXEL_UTIL_AARCH64_H

 
@@ -0,0 +1,40 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su@huawei.com>
+ *          Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_PIXEL_UTIL_AARCH64_H
+#define X265_PIXEL_UTIL_AARCH64_H
+
+/* Prototypes for the AArch64 NEON pixel-utility routines.
+ *
+ * Every SATD entry point has the same signature and differs only in the
+ * block size embedded in its symbol name, so the prototypes are generated
+ * from one local helper macro. The helper is removed again right after use
+ * and declares exactly x265_pixel_satd_<size>_neon for each size listed. */
+#define X265_DECL_SATD_NEON(size) \
+    int x265_pixel_satd_ ## size ## _neon(const pixel* pix1, intptr_t stride_pix1, \
+                                          const pixel* pix2, intptr_t stride_pix2)
+
+X265_DECL_SATD_NEON(4x4);
+X265_DECL_SATD_NEON(4x8);
+X265_DECL_SATD_NEON(4x16);
+X265_DECL_SATD_NEON(4x32);
+X265_DECL_SATD_NEON(8x4);
+X265_DECL_SATD_NEON(8x8);
+X265_DECL_SATD_NEON(12x16);
+X265_DECL_SATD_NEON(12x32);
+
+#undef X265_DECL_SATD_NEON
+
+uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU,
+                         int16_t* qCoef, int qBits, int add, int numCoeff);
+
+/* NOTE(review): this is the only prototype in the header that goes through
+ * PFX(); the others spell out the x265_ prefix literally. Confirm which
+ * convention the matching assembly symbols follow. */
+int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride,
+                          const pixel* recon, intptr_t rstride);
+
+#endif // ifndef X265_PIXEL_UTIL_AARCH64_H
​

x265_3.4.tar.gz/source/common/aarch64/pixel.h Added

@@ -0,0 +1,105 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+/* NOTE(review): the guard name carries an I386 tag although every routine
+ * declared below is an AArch64 NEON one; the name is kept unchanged here. */
+#ifndef X265_I386_PIXEL_AARCH64_H
+#define X265_I386_PIXEL_AARCH64_H
+
+/* The three kernel families declared in this header (pixel_avg_pp, sad_x3,
+ * sad_x4) are all provided for the same 25 block sizes. The size list is
+ * written once and expanded once per family; each expansion declares
+ * x265_<family>_<size>_neon. All helper macros are removed again at the end. */
+#define X265_AARCH64_PU_SIZES(DECL) \
+    DECL(4x4)   DECL(4x8)   DECL(4x16) \
+    DECL(8x4)   DECL(8x8)   DECL(8x16)  DECL(8x32) \
+    DECL(12x16) \
+    DECL(16x4)  DECL(16x8)  DECL(16x12) DECL(16x16) DECL(16x32) DECL(16x64) \
+    DECL(24x32) \
+    DECL(32x8)  DECL(32x16) DECL(32x24) DECL(32x32) DECL(32x64) \
+    DECL(48x64) \
+    DECL(64x16) DECL(64x32) DECL(64x48) DECL(64x64)
+
+/* Two-source pixel averaging; the trailing int parameter is unnamed. */
+#define X265_DECL_PIXEL_AVG_PP_NEON(size) \
+    void x265_pixel_avg_pp_ ## size ## _neon(pixel* dst, intptr_t dstride, \
+                                             const pixel* src0, intptr_t sstride0, \
+                                             const pixel* src1, intptr_t sstride1, int);
+
+/* One fenc block against three reference blocks sharing one stride;
+ * results are returned through res. */
+#define X265_DECL_SAD_X3_NEON(size) \
+    void x265_sad_x3_ ## size ## _neon(const pixel* fenc, const pixel* fref0, \
+                                       const pixel* fref1, const pixel* fref2, \
+                                       intptr_t frefstride, int32_t* res);
+
+/* Same as the x3 family with a fourth reference block. */
+#define X265_DECL_SAD_X4_NEON(size) \
+    void x265_sad_x4_ ## size ## _neon(const pixel* fenc, const pixel* fref0, \
+                                       const pixel* fref1, const pixel* fref2, \
+                                       const pixel* fref3, intptr_t frefstride, int32_t* res);
+
+X265_AARCH64_PU_SIZES(X265_DECL_PIXEL_AVG_PP_NEON)
+
+X265_AARCH64_PU_SIZES(X265_DECL_SAD_X3_NEON)
+
+X265_AARCH64_PU_SIZES(X265_DECL_SAD_X4_NEON)
+
+#undef X265_DECL_SAD_X4_NEON
+#undef X265_DECL_SAD_X3_NEON
+#undef X265_DECL_PIXEL_AVG_PP_NEON
+#undef X265_AARCH64_PU_SIZES
+
+#endif // ifndef X265_I386_PIXEL_AARCH64_H

 
@@ -0,0 +1,105 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+/* NOTE(review): the guard name carries an I386 tag although every routine
+ * declared below is an AArch64 NEON one; the name is kept unchanged here. */
+#ifndef X265_I386_PIXEL_AARCH64_H
+#define X265_I386_PIXEL_AARCH64_H
+
+/* The three kernel families declared in this header (pixel_avg_pp, sad_x3,
+ * sad_x4) are all provided for the same 25 block sizes. The size list is
+ * written once and expanded once per family; each expansion declares
+ * x265_<family>_<size>_neon. All helper macros are removed again at the end. */
+#define X265_AARCH64_PU_SIZES(DECL) \
+    DECL(4x4)   DECL(4x8)   DECL(4x16) \
+    DECL(8x4)   DECL(8x8)   DECL(8x16)  DECL(8x32) \
+    DECL(12x16) \
+    DECL(16x4)  DECL(16x8)  DECL(16x12) DECL(16x16) DECL(16x32) DECL(16x64) \
+    DECL(24x32) \
+    DECL(32x8)  DECL(32x16) DECL(32x24) DECL(32x32) DECL(32x64) \
+    DECL(48x64) \
+    DECL(64x16) DECL(64x32) DECL(64x48) DECL(64x64)
+
+/* Two-source pixel averaging; the trailing int parameter is unnamed. */
+#define X265_DECL_PIXEL_AVG_PP_NEON(size) \
+    void x265_pixel_avg_pp_ ## size ## _neon(pixel* dst, intptr_t dstride, \
+                                             const pixel* src0, intptr_t sstride0, \
+                                             const pixel* src1, intptr_t sstride1, int);
+
+/* One fenc block against three reference blocks sharing one stride;
+ * results are returned through res. */
+#define X265_DECL_SAD_X3_NEON(size) \
+    void x265_sad_x3_ ## size ## _neon(const pixel* fenc, const pixel* fref0, \
+                                       const pixel* fref1, const pixel* fref2, \
+                                       intptr_t frefstride, int32_t* res);
+
+/* Same as the x3 family with a fourth reference block. */
+#define X265_DECL_SAD_X4_NEON(size) \
+    void x265_sad_x4_ ## size ## _neon(const pixel* fenc, const pixel* fref0, \
+                                       const pixel* fref1, const pixel* fref2, \
+                                       const pixel* fref3, intptr_t frefstride, int32_t* res);
+
+X265_AARCH64_PU_SIZES(X265_DECL_PIXEL_AVG_PP_NEON)
+
+X265_AARCH64_PU_SIZES(X265_DECL_SAD_X3_NEON)
+
+X265_AARCH64_PU_SIZES(X265_DECL_SAD_X4_NEON)
+
+#undef X265_DECL_SAD_X4_NEON
+#undef X265_DECL_SAD_X3_NEON
+#undef X265_DECL_PIXEL_AVG_PP_NEON
+#undef X265_AARCH64_PU_SIZES
+
+#endif // ifndef X265_I386_PIXEL_AARCH64_H
​

x265_3.4.tar.gz/source/common/aarch64/sad-a.S Added

 
@@ -0,0 +1,105 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+.macro SAD_X_START_8 x                     // first 8-byte row: seeds the per-reference accumulators v16-v18 (and v19 when x is 4); x is 3 or 4 in this file
+    ld1             {v0.8b}, [x0], x9      // v0 = 8 bytes at [x0]; x0 += x9 (x9 is loaded with FENC_STRIDE by SAD_X_8xN)
+.if \x == 3                                // 3-reference form: pointers x1-x3, each post-incremented by x4
+    ld1             {v1.8b}, [x1], x4
+    ld1             {v2.8b}, [x2], x4
+    ld1             {v3.8b}, [x3], x4
+.elseif \x == 4                            // 4-reference form: pointers x1-x4, each post-incremented by x5
+    ld1             {v1.8b}, [x1], x5
+    ld1             {v2.8b}, [x2], x5
+    ld1             {v3.8b}, [x3], x5
+    ld1             {v4.8b}, [x4], x5
+.endif
+    uabdl           v16.8h, v0.8b, v1.8b   // v16 = per-byte absolute difference of v0 and v1, widened to eight 16-bit lanes
+    uabdl           v17.8h, v0.8b, v2.8b   // v17 = same against v2
+    uabdl           v18.8h, v0.8b, v3.8b   // v18 = same against v3
+.if \x == 4                                // the fourth accumulator is only produced in the 4-reference form
+    uabdl           v19.8h, v0.8b, v4.8b   // v19 = same against v4
+.endif
+.endm
+
+.macro SAD_X_8 x                           // one further 8-byte row: adds its absolute differences into v16-v18 (and v19 when x is 4)
+    ld1             {v0.8b}, [x0], x9      // v0 = next 8 bytes at [x0]; x0 += x9
+.if \x == 3                                // 3-reference form: pointers x1-x3 advance by x4
+    ld1             {v1.8b}, [x1], x4
+    ld1             {v2.8b}, [x2], x4
+    ld1             {v3.8b}, [x3], x4
+.elseif \x == 4                            // 4-reference form: pointers x1-x4 advance by x5
+    ld1             {v1.8b}, [x1], x5
+    ld1             {v2.8b}, [x2], x5
+    ld1             {v3.8b}, [x3], x5
+    ld1             {v4.8b}, [x4], x5
+.endif
+    uabal           v16.8h, v0.8b, v1.8b   // v16 += per-byte absolute difference of v0 and v1 (16-bit lanes); expects v16 already seeded by SAD_X_START_8
+    uabal           v17.8h, v0.8b, v2.8b   // v17 += same against v2
+    uabal           v18.8h, v0.8b, v3.8b   // v18 += same against v3
+.if \x == 4                                // fourth accumulator only in the 4-reference form
+    uabal           v19.8h, v0.8b, v4.8b   // v19 += same against v4
+.endif
+.endm
+
+.macro SAD_X_8xN x, h                      // emits one function that computes x (3 or 4) sums of absolute differences over an 8-byte-wide block of h rows
+function x265_sad_x\x\()_8x\h\()_neon
+    mov             x9, #FENC_STRIDE       // x9 = byte step applied to x0 after each row; FENC_STRIDE is defined outside this file
+    SAD_X_START_8 \x                       // row 0 initialises the accumulators
+.rept \h - 1                               // remaining rows, fully unrolled; h <= 32 in this file, so a 16-bit lane (at most 32*255) cannot overflow
+    SAD_X_8 \x
+.endr
+    uaddlv          s0, v16.8h             // sum the eight 16-bit lanes into one 32-bit scalar; overwrites v0, which is no longer needed
+    uaddlv          s1, v17.8h
+    uaddlv          s2, v18.8h
+.if \x == 4
+    uaddlv          s3, v19.8h
+.endif
+
+.if \x == 3                                // NOTE(review): x5 is presumably the int32_t result array of the x3 variant - its prototype is not visible here, confirm
+    stp             s0, s1, [x5]           // results 0 and 1 as consecutive 32-bit words
+    str             s2, [x5, #8]           // result 2
+.elseif \x == 4                            // x6 is the 7th argument (res) of the sad_x4 prototypes, assuming the standard AArch64 argument registers
+    stp             s0, s1, [x6]           // results 0 and 1
+    stp             s2, s3, [x6, #8]       // results 2 and 3
+.endif
+    ret
+endfunc
+.endm
+
+SAD_X_8xN 3 4                              // passes x265_sad_x3_8x4_neon to the function macro
+SAD_X_8xN 3 8                              // x265_sad_x3_8x8_neon
+SAD_X_8xN 3 16                             // x265_sad_x3_8x16_neon
+SAD_X_8xN 3 32                             // x265_sad_x3_8x32_neon
+
+SAD_X_8xN 4 4                              // x265_sad_x4_8x4_neon
+SAD_X_8xN 4 8                              // x265_sad_x4_8x8_neon
+SAD_X_8xN 4 16                             // x265_sad_x4_8x16_neon
+SAD_X_8xN 4 32                             // x265_sad_x4_8x32_neon
​

x265_3.3.tar.gz/source/common/arm/asm-primitives.cpp -> x265_3.4.tar.gz/source/common/arm/asm-primitives.cpp Changed

@@ -5,6 +5,7 @@
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
  *          Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
  *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
+ *          Hongbin Liu <liuhongbin1@huawei.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -48,77 +49,77 @@
         p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
 
         // addAvg
-         p.pu[LUMA_4x4].addAvg   = PFX(addAvg_4x4_neon);
-         p.pu[LUMA_4x8].addAvg   = PFX(addAvg_4x8_neon);
-         p.pu[LUMA_4x16].addAvg  = PFX(addAvg_4x16_neon);
-         p.pu[LUMA_8x4].addAvg   = PFX(addAvg_8x4_neon);
-         p.pu[LUMA_8x8].addAvg   = PFX(addAvg_8x8_neon);
-         p.pu[LUMA_8x16].addAvg  = PFX(addAvg_8x16_neon);
-         p.pu[LUMA_8x32].addAvg  = PFX(addAvg_8x32_neon);
-         p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_neon);
-         p.pu[LUMA_16x4].addAvg  = PFX(addAvg_16x4_neon);
-         p.pu[LUMA_16x8].addAvg  = PFX(addAvg_16x8_neon);
-         p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_neon);
-         p.pu[LUMA_16x16].addAvg = PFX(addAvg_16x16_neon);
-         p.pu[LUMA_16x32].addAvg = PFX(addAvg_16x32_neon);
-         p.pu[LUMA_16x64].addAvg = PFX(addAvg_16x64_neon);
-         p.pu[LUMA_24x32].addAvg = PFX(addAvg_24x32_neon);
-         p.pu[LUMA_32x8].addAvg  = PFX(addAvg_32x8_neon);
-         p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_neon);
-         p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_neon);
-         p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_neon);
-         p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_neon);
-         p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_neon);
-         p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_neon);
-         p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_neon);
-         p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_neon);
-         p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_neon);
+         p.pu[LUMA_4x4].addAvg[NONALIGNED]   = PFX(addAvg_4x4_neon);
+         p.pu[LUMA_4x8].addAvg[NONALIGNED]   = PFX(addAvg_4x8_neon);
+         p.pu[LUMA_4x16].addAvg[NONALIGNED]  = PFX(addAvg_4x16_neon);
+         p.pu[LUMA_8x4].addAvg[NONALIGNED]   = PFX(addAvg_8x4_neon);
+         p.pu[LUMA_8x8].addAvg[NONALIGNED]   = PFX(addAvg_8x8_neon);
+         p.pu[LUMA_8x16].addAvg[NONALIGNED]  = PFX(addAvg_8x16_neon);
+         p.pu[LUMA_8x32].addAvg[NONALIGNED]  = PFX(addAvg_8x32_neon);
+         p.pu[LUMA_12x16].addAvg[NONALIGNED] = PFX(addAvg_12x16_neon);
+         p.pu[LUMA_16x4].addAvg[NONALIGNED]  = PFX(addAvg_16x4_neon);
+         p.pu[LUMA_16x8].addAvg[NONALIGNED]  = PFX(addAvg_16x8_neon);
+         p.pu[LUMA_16x12].addAvg[NONALIGNED] = PFX(addAvg_16x12_neon);
+         p.pu[LUMA_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_neon);
+         p.pu[LUMA_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_neon);
+         p.pu[LUMA_16x64].addAvg[NONALIGNED] = PFX(addAvg_16x64_neon);
+         p.pu[LUMA_24x32].addAvg[NONALIGNED] = PFX(addAvg_24x32_neon);
+         p.pu[LUMA_32x8].addAvg[NONALIGNED]  = PFX(addAvg_32x8_neon);
+         p.pu[LUMA_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_neon);
+         p.pu[LUMA_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_neon);
+         p.pu[LUMA_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_neon);
+         p.pu[LUMA_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_neon);
+         p.pu[LUMA_48x64].addAvg[NONALIGNED] = PFX(addAvg_48x64_neon);
+         p.pu[LUMA_64x16].addAvg[NONALIGNED] = PFX(addAvg_64x16_neon);
+         p.pu[LUMA_64x32].addAvg[NONALIGNED] = PFX(addAvg_64x32_neon);
+         p.pu[LUMA_64x48].addAvg[NONALIGNED] = PFX(addAvg_64x48_neon);
+         p.pu[LUMA_64x64].addAvg[NONALIGNED] = PFX(addAvg_64x64_neon);
 
         // chroma addAvg
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg   = PFX(addAvg_4x2_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg   = PFX(addAvg_4x4_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg   = PFX(addAvg_4x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg  = PFX(addAvg_4x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg   = PFX(addAvg_6x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg   = PFX(addAvg_8x2_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg   = PFX(addAvg_8x4_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg   = PFX(addAvg_8x6_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg   = PFX(addAvg_8x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg  = PFX(addAvg_8x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg  = PFX(addAvg_8x32_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = PFX(addAvg_12x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg  = PFX(addAvg_16x4_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg  = PFX(addAvg_16x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = PFX(addAvg_16x12_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = PFX(addAvg_16x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = PFX(addAvg_16x32_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg = PFX(addAvg_24x32_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg  = PFX(addAvg_32x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_neon);
-
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg   = PFX(addAvg_4x8_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg  = PFX(addAvg_4x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg  = PFX(addAvg_4x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg  = PFX(addAvg_6x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg   = PFX(addAvg_8x4_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg   = PFX(addAvg_8x8_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg  = PFX(addAvg_8x12_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg  = PFX(addAvg_8x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg  = PFX(addAvg_8x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg  = PFX(addAvg_8x64_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg = PFX(addAvg_12x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg  = PFX(addAvg_16x8_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg = PFX(addAvg_16x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = PFX(addAvg_16x24_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = PFX(addAvg_16x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg = PFX(addAvg_16x64_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg = PFX(addAvg_24x64_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg[NONALIGNED]   = PFX(addAvg_4x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg[NONALIGNED]   = PFX(addAvg_4x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg[NONALIGNED]   = PFX(addAvg_4x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg[NONALIGNED]  = PFX(addAvg_4x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg[NONALIGNED]   = PFX(addAvg_6x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg[NONALIGNED]   = PFX(addAvg_8x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg[NONALIGNED]   = PFX(addAvg_8x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg[NONALIGNED]   = PFX(addAvg_8x6_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg[NONALIGNED]   = PFX(addAvg_8x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg[NONALIGNED]  = PFX(addAvg_8x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg[NONALIGNED]  = PFX(addAvg_8x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg[NONALIGNED] = PFX(addAvg_12x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg[NONALIGNED]  = PFX(addAvg_16x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg[NONALIGNED]  = PFX(addAvg_16x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg[NONALIGNED] = PFX(addAvg_16x12_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg[NONALIGNED] = PFX(addAvg_24x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg[NONALIGNED]  = PFX(addAvg_32x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_neon);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg[NONALIGNED]   = PFX(addAvg_4x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg[NONALIGNED]  = PFX(addAvg_4x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg[NONALIGNED]  = PFX(addAvg_4x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg[NONALIGNED]  = PFX(addAvg_6x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg[NONALIGNED]   = PFX(addAvg_8x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg[NONALIGNED]   = PFX(addAvg_8x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg[NONALIGNED]  = PFX(addAvg_8x12_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg[NONALIGNED]  = PFX(addAvg_8x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg[NONALIGNED]  = PFX(addAvg_8x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg[NONALIGNED]  = PFX(addAvg_8x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg[NONALIGNED] = PFX(addAvg_12x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg[NONALIGNED]  = PFX(addAvg_16x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg[NONALIGNED] = PFX(addAvg_16x24_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg[NONALIGNED] = PFX(addAvg_16x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg[NONALIGNED] = PFX(addAvg_24x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg[NONALIGNED] = PFX(addAvg_32x48_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_neon);
 
         // quant
          p.quant = PFX(quant_neon);
@@ -402,7 +403,7 @@
         p.scale2D_64to32  = PFX(scale2D_64to32_neon);
 
         // scale1D_128to64
-        p.scale1D_128to64 = PFX(scale1D_128to64_neon);
+        p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_neon);
 
         // copy_count
         p.cu[BLOCK_4x4].copy_cnt     = PFX(copy_cnt_4_neon);
@@ -411,37 +412,37 @@
         p.cu[BLOCK_32x32].copy_cnt   = PFX(copy_cnt_32_neon);
 
         // filterPixelToShort
-        p.pu[LUMA_4x4].convert_p2s   = PFX(filterPixelToShort_4x4_neon);
-        p.pu[LUMA_4x8].convert_p2s   = PFX(filterPixelToShort_4x8_neon);
-        p.pu[LUMA_4x16].convert_p2s  = PFX(filterPixelToShort_4x16_neon);
-        p.pu[LUMA_8x4].convert_p2s   = PFX(filterPixelToShort_8x4_neon);
-        p.pu[LUMA_8x8].convert_p2s   = PFX(filterPixelToShort_8x8_neon);
-        p.pu[LUMA_8x16].convert_p2s  = PFX(filterPixelToShort_8x16_neon);
-        p.pu[LUMA_8x32].convert_p2s  = PFX(filterPixelToShort_8x32_neon);
-        p.pu[LUMA_12x16].convert_p2s = PFX(filterPixelToShort_12x16_neon);
-        p.pu[LUMA_16x4].convert_p2s  = PFX(filterPixelToShort_16x4_neon);
-        p.pu[LUMA_16x8].convert_p2s  = PFX(filterPixelToShort_16x8_neon);
-        p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_neon);
-        p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_neon);
-        p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_neon);
-        p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_neon);
-        p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_neon);
-        p.pu[LUMA_32x8].convert_p2s  = PFX(filterPixelToShort_32x8_neon);
-        p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_neon);
-        p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_neon);
-        p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_neon);
-        p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_neon);
-        p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_neon);
-        p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_neon);
-        p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_neon);
-        p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_neon);
-        p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_neon);
+        p.pu[LUMA_4x4].convert_p2s[NONALIGNED]   = PFX(filterPixelToShort_4x4_neon);
+        p.pu[LUMA_4x8].convert_p2s[NONALIGNED]   = PFX(filterPixelToShort_4x8_neon);
+        p.pu[LUMA_4x16].convert_p2s[NONALIGNED]  = PFX(filterPixelToShort_4x16_neon);
+        p.pu[LUMA_8x4].convert_p2s[NONALIGNED]   = PFX(filterPixelToShort_8x4_neon);
+        p.pu[LUMA_8x8].convert_p2s[NONALIGNED]   = PFX(filterPixelToShort_8x8_neon);
+        p.pu[LUMA_8x16].convert_p2s[NONALIGNED]  = PFX(filterPixelToShort_8x16_neon);
+        p.pu[LUMA_8x32].convert_p2s[NONALIGNED]  = PFX(filterPixelToShort_8x32_neon);

 
@@ -5,6 +5,7 @@
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
  *          Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
  *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
+ *          Hongbin Liu <liuhongbin1@huawei.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -48,77 +49,77 @@
         p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
 
         // addAvg
-         p.pu[LUMA_4x4].addAvg   = PFX(addAvg_4x4_neon);
-         p.pu[LUMA_4x8].addAvg   = PFX(addAvg_4x8_neon);
-         p.pu[LUMA_4x16].addAvg  = PFX(addAvg_4x16_neon);
-         p.pu[LUMA_8x4].addAvg   = PFX(addAvg_8x4_neon);
-         p.pu[LUMA_8x8].addAvg   = PFX(addAvg_8x8_neon);
-         p.pu[LUMA_8x16].addAvg  = PFX(addAvg_8x16_neon);
-         p.pu[LUMA_8x32].addAvg  = PFX(addAvg_8x32_neon);
-         p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_neon);
-         p.pu[LUMA_16x4].addAvg  = PFX(addAvg_16x4_neon);
-         p.pu[LUMA_16x8].addAvg  = PFX(addAvg_16x8_neon);
-         p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_neon);
-         p.pu[LUMA_16x16].addAvg = PFX(addAvg_16x16_neon);
-         p.pu[LUMA_16x32].addAvg = PFX(addAvg_16x32_neon);
-         p.pu[LUMA_16x64].addAvg = PFX(addAvg_16x64_neon);
-         p.pu[LUMA_24x32].addAvg = PFX(addAvg_24x32_neon);
-         p.pu[LUMA_32x8].addAvg  = PFX(addAvg_32x8_neon);
-         p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_neon);
-         p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_neon);
-         p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_neon);
-         p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_neon);
-         p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_neon);
-         p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_neon);
-         p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_neon);
-         p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_neon);
-         p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_neon);
+         p.pu[LUMA_4x4].addAvg[NONALIGNED]   = PFX(addAvg_4x4_neon);
+         p.pu[LUMA_4x8].addAvg[NONALIGNED]   = PFX(addAvg_4x8_neon);
+         p.pu[LUMA_4x16].addAvg[NONALIGNED]  = PFX(addAvg_4x16_neon);
+         p.pu[LUMA_8x4].addAvg[NONALIGNED]   = PFX(addAvg_8x4_neon);
+         p.pu[LUMA_8x8].addAvg[NONALIGNED]   = PFX(addAvg_8x8_neon);
+         p.pu[LUMA_8x16].addAvg[NONALIGNED]  = PFX(addAvg_8x16_neon);
+         p.pu[LUMA_8x32].addAvg[NONALIGNED]  = PFX(addAvg_8x32_neon);
+         p.pu[LUMA_12x16].addAvg[NONALIGNED] = PFX(addAvg_12x16_neon);
+         p.pu[LUMA_16x4].addAvg[NONALIGNED]  = PFX(addAvg_16x4_neon);
+         p.pu[LUMA_16x8].addAvg[NONALIGNED]  = PFX(addAvg_16x8_neon);
+         p.pu[LUMA_16x12].addAvg[NONALIGNED] = PFX(addAvg_16x12_neon);
+         p.pu[LUMA_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_neon);
+         p.pu[LUMA_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_neon);
+         p.pu[LUMA_16x64].addAvg[NONALIGNED] = PFX(addAvg_16x64_neon);
+         p.pu[LUMA_24x32].addAvg[NONALIGNED] = PFX(addAvg_24x32_neon);
+         p.pu[LUMA_32x8].addAvg[NONALIGNED]  = PFX(addAvg_32x8_neon);
+         p.pu[LUMA_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_neon);
+         p.pu[LUMA_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_neon);
+         p.pu[LUMA_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_neon);
+         p.pu[LUMA_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_neon);
+         p.pu[LUMA_48x64].addAvg[NONALIGNED] = PFX(addAvg_48x64_neon);
+         p.pu[LUMA_64x16].addAvg[NONALIGNED] = PFX(addAvg_64x16_neon);
+         p.pu[LUMA_64x32].addAvg[NONALIGNED] = PFX(addAvg_64x32_neon);
+         p.pu[LUMA_64x48].addAvg[NONALIGNED] = PFX(addAvg_64x48_neon);
+         p.pu[LUMA_64x64].addAvg[NONALIGNED] = PFX(addAvg_64x64_neon);
 
         // chroma addAvg
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg   = PFX(addAvg_4x2_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg   = PFX(addAvg_4x4_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg   = PFX(addAvg_4x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg  = PFX(addAvg_4x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg   = PFX(addAvg_6x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg   = PFX(addAvg_8x2_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg   = PFX(addAvg_8x4_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg   = PFX(addAvg_8x6_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg   = PFX(addAvg_8x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg  = PFX(addAvg_8x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg  = PFX(addAvg_8x32_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = PFX(addAvg_12x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg  = PFX(addAvg_16x4_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg  = PFX(addAvg_16x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = PFX(addAvg_16x12_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = PFX(addAvg_16x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = PFX(addAvg_16x32_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg = PFX(addAvg_24x32_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg  = PFX(addAvg_32x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_neon);
-
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg   = PFX(addAvg_4x8_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg  = PFX(addAvg_4x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg  = PFX(addAvg_4x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg  = PFX(addAvg_6x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg   = PFX(addAvg_8x4_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg   = PFX(addAvg_8x8_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg  = PFX(addAvg_8x12_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg  = PFX(addAvg_8x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg  = PFX(addAvg_8x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg  = PFX(addAvg_8x64_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg = PFX(addAvg_12x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg  = PFX(addAvg_16x8_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg = PFX(addAvg_16x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = PFX(addAvg_16x24_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = PFX(addAvg_16x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg = PFX(addAvg_16x64_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg = PFX(addAvg_24x64_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg[NONALIGNED]   = PFX(addAvg_4x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg[NONALIGNED]   = PFX(addAvg_4x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg[NONALIGNED]   = PFX(addAvg_4x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg[NONALIGNED]  = PFX(addAvg_4x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg[NONALIGNED]   = PFX(addAvg_6x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg[NONALIGNED]   = PFX(addAvg_8x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg[NONALIGNED]   = PFX(addAvg_8x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg[NONALIGNED]   = PFX(addAvg_8x6_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg[NONALIGNED]   = PFX(addAvg_8x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg[NONALIGNED]  = PFX(addAvg_8x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg[NONALIGNED]  = PFX(addAvg_8x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg[NONALIGNED] = PFX(addAvg_12x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg[NONALIGNED]  = PFX(addAvg_16x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg[NONALIGNED]  = PFX(addAvg_16x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg[NONALIGNED] = PFX(addAvg_16x12_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg[NONALIGNED] = PFX(addAvg_24x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg[NONALIGNED]  = PFX(addAvg_32x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_neon);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg[NONALIGNED]   = PFX(addAvg_4x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg[NONALIGNED]  = PFX(addAvg_4x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg[NONALIGNED]  = PFX(addAvg_4x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg[NONALIGNED]  = PFX(addAvg_6x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg[NONALIGNED]   = PFX(addAvg_8x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg[NONALIGNED]   = PFX(addAvg_8x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg[NONALIGNED]  = PFX(addAvg_8x12_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg[NONALIGNED]  = PFX(addAvg_8x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg[NONALIGNED]  = PFX(addAvg_8x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg[NONALIGNED]  = PFX(addAvg_8x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg[NONALIGNED] = PFX(addAvg_12x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg[NONALIGNED]  = PFX(addAvg_16x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg[NONALIGNED] = PFX(addAvg_16x24_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg[NONALIGNED] = PFX(addAvg_16x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg[NONALIGNED] = PFX(addAvg_24x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg[NONALIGNED] = PFX(addAvg_32x48_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_neon);
 
         // quant
          p.quant = PFX(quant_neon);
@@ -402,7 +403,7 @@
         p.scale2D_64to32  = PFX(scale2D_64to32_neon);
 
         // scale1D_128to64
-        p.scale1D_128to64 = PFX(scale1D_128to64_neon);
+        p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_neon);
 
         // copy_count
         p.cu[BLOCK_4x4].copy_cnt     = PFX(copy_cnt_4_neon);
@@ -411,37 +412,37 @@
         p.cu[BLOCK_32x32].copy_cnt   = PFX(copy_cnt_32_neon);
 
         // filterPixelToShort
-        p.pu[LUMA_4x4].convert_p2s   = PFX(filterPixelToShort_4x4_neon);
-        p.pu[LUMA_4x8].convert_p2s   = PFX(filterPixelToShort_4x8_neon);
-        p.pu[LUMA_4x16].convert_p2s  = PFX(filterPixelToShort_4x16_neon);
-        p.pu[LUMA_8x4].convert_p2s   = PFX(filterPixelToShort_8x4_neon);
-        p.pu[LUMA_8x8].convert_p2s   = PFX(filterPixelToShort_8x8_neon);
-        p.pu[LUMA_8x16].convert_p2s  = PFX(filterPixelToShort_8x16_neon);
-        p.pu[LUMA_8x32].convert_p2s  = PFX(filterPixelToShort_8x32_neon);
-        p.pu[LUMA_12x16].convert_p2s = PFX(filterPixelToShort_12x16_neon);
-        p.pu[LUMA_16x4].convert_p2s  = PFX(filterPixelToShort_16x4_neon);
-        p.pu[LUMA_16x8].convert_p2s  = PFX(filterPixelToShort_16x8_neon);
-        p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_neon);
-        p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_neon);
-        p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_neon);
-        p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_neon);
-        p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_neon);
-        p.pu[LUMA_32x8].convert_p2s  = PFX(filterPixelToShort_32x8_neon);
-        p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_neon);
-        p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_neon);
-        p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_neon);
-        p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_neon);
-        p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_neon);
-        p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_neon);
-        p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_neon);
-        p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_neon);
-        p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_neon);
+        p.pu[LUMA_4x4].convert_p2s[NONALIGNED]   = PFX(filterPixelToShort_4x4_neon);
+        p.pu[LUMA_4x8].convert_p2s[NONALIGNED]   = PFX(filterPixelToShort_4x8_neon);
+        p.pu[LUMA_4x16].convert_p2s[NONALIGNED]  = PFX(filterPixelToShort_4x16_neon);
+        p.pu[LUMA_8x4].convert_p2s[NONALIGNED]   = PFX(filterPixelToShort_8x4_neon);
+        p.pu[LUMA_8x8].convert_p2s[NONALIGNED]   = PFX(filterPixelToShort_8x8_neon);
+        p.pu[LUMA_8x16].convert_p2s[NONALIGNED]  = PFX(filterPixelToShort_8x16_neon);
+        p.pu[LUMA_8x32].convert_p2s[NONALIGNED]  = PFX(filterPixelToShort_8x32_neon);
​

x265_3.3.tar.gz/source/common/common.h -> x265_3.4.tar.gz/source/common/common.h Changed

 
@@ -129,6 +129,7 @@
 typedef uint64_t sum2_t;
 typedef uint64_t pixel4;
 typedef int64_t  ssum2_t;
+#define SHIFT_TO_BITPLANE 9
 #define HISTOGRAM_BINS 1024
 #else
 typedef uint8_t  pixel;
@@ -136,6 +137,7 @@
 typedef uint32_t sum2_t;
 typedef uint32_t pixel4;
 typedef int32_t  ssum2_t; // Signed sum
+#define SHIFT_TO_BITPLANE 7
 #define HISTOGRAM_BINS 256
 #endif // if HIGH_BIT_DEPTH
 
@@ -270,6 +272,9 @@
 #define MAX_TR_SIZE (1 << MAX_LOG2_TR_SIZE)
 #define MAX_TS_SIZE (1 << MAX_LOG2_TS_SIZE)
 
+#define RDCOST_BASED_RSKIP 1
+#define EDGE_BASED_RSKIP 2
+
 #define COEF_REMAIN_BIN_REDUCTION   3 // indicates the level at which the VLC
                                       // transitions from Golomb-Rice to TU+EG(k)
 
​

x265_3.3.tar.gz/source/common/cpu.cpp -> x265_3.4.tar.gz/source/common/cpu.cpp Changed

 
@@ -5,6 +5,8 @@
  *          Laurent Aimar <fenrir@via.ecp.fr>
  *          Fiona Glaser <fiona@x264.com>
  *          Steve Borho <steve@borho.org>
+ *          Hongbin Liu <liuhongbin1@huawei.com>
+ *          Yimeng Su <yimeng.su@huawei.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -367,6 +369,8 @@
     flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
 #endif
     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
+#elif X265_ARCH_ARM64
+    flags |= X265_CPU_NEON;
 #endif // if HAVE_ARMV6
     return flags;
 }
​

x265_3.3.tar.gz/source/common/frame.cpp -> x265_3.4.tar.gz/source/common/frame.cpp Changed

@@ -61,6 +61,8 @@
     m_edgePic = NULL;
     m_gaussianPic = NULL;
     m_thetaPic = NULL;
+    m_edgeBitPlane = NULL;
+    m_edgeBitPic = NULL;
 }
 
 bool Frame::create(x265_param *param, float* quantOffsets)
@@ -115,6 +117,19 @@
         m_thetaPic = X265_MALLOC(pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
     }
 
+    if (param->recursionSkipMode == EDGE_BASED_RSKIP)
+    {
+        uint32_t numCuInWidth = (param->sourceWidth + param->maxCUSize - 1) / param->maxCUSize;
+        uint32_t numCuInHeight = (param->sourceHeight + param->maxCUSize - 1) / param->maxCUSize;
+        uint32_t lumaMarginX = param->maxCUSize + 32;
+        uint32_t lumaMarginY = param->maxCUSize + 16;
+        uint32_t stride = (numCuInWidth * param->maxCUSize) + (lumaMarginX << 1);
+        uint32_t maxHeight = numCuInHeight * param->maxCUSize;
+        uint32_t bitPlaneSize = stride * (maxHeight + (lumaMarginY * 2));
+        CHECKED_MALLOC_ZERO(m_edgeBitPlane, pixel, bitPlaneSize);
+        m_edgeBitPic = m_edgeBitPlane + lumaMarginY * stride + lumaMarginX;
+    }
+
     if (m_fencPic->create(param, !!m_param->bCopyPicToFrame) && m_lowres.create(param, m_fencPic, param->rc.qgSize))
     {
         X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized");
@@ -267,4 +282,10 @@
         X265_FREE(m_gaussianPic);
         X265_FREE(m_thetaPic);
     }
+
+    if (m_param->recursionSkipMode == EDGE_BASED_RSKIP)
+    {
+        X265_FREE_ZERO(m_edgeBitPlane);
+        m_edgeBitPic = NULL;
+    }
 }

 
@@ -61,6 +61,8 @@
     m_edgePic = NULL;
     m_gaussianPic = NULL;
     m_thetaPic = NULL;
+    m_edgeBitPlane = NULL;
+    m_edgeBitPic = NULL;
 }
 
 bool Frame::create(x265_param *param, float* quantOffsets)
@@ -115,6 +117,19 @@
         m_thetaPic = X265_MALLOC(pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
     }
 
+    if (param->recursionSkipMode == EDGE_BASED_RSKIP)
+    {
+        uint32_t numCuInWidth = (param->sourceWidth + param->maxCUSize - 1) / param->maxCUSize;
+        uint32_t numCuInHeight = (param->sourceHeight + param->maxCUSize - 1) / param->maxCUSize;
+        uint32_t lumaMarginX = param->maxCUSize + 32;
+        uint32_t lumaMarginY = param->maxCUSize + 16;
+        uint32_t stride = (numCuInWidth * param->maxCUSize) + (lumaMarginX << 1);
+        uint32_t maxHeight = numCuInHeight * param->maxCUSize;
+        uint32_t bitPlaneSize = stride * (maxHeight + (lumaMarginY * 2));
+        CHECKED_MALLOC_ZERO(m_edgeBitPlane, pixel, bitPlaneSize);
+        m_edgeBitPic = m_edgeBitPlane + lumaMarginY * stride + lumaMarginX;
+    }
+
     if (m_fencPic->create(param, !!m_param->bCopyPicToFrame) && m_lowres.create(param, m_fencPic, param->rc.qgSize))
     {
         X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized");
@@ -267,4 +282,10 @@
         X265_FREE(m_gaussianPic);
         X265_FREE(m_thetaPic);
     }
+
+    if (m_param->recursionSkipMode == EDGE_BASED_RSKIP)
+    {
+        X265_FREE_ZERO(m_edgeBitPlane);
+        m_edgeBitPic = NULL;
+    }
 }
​

x265_3.3.tar.gz/source/common/frame.h -> x265_3.4.tar.gz/source/common/frame.h Changed

 
@@ -99,7 +99,7 @@
     float*                 m_quantOffsets;       // points to quantOffsets in x265_picture
     x265_sei               m_userSEI;
     uint32_t               m_picStruct;          // picture structure SEI message
-    x265_dolby_vision_rpu            m_rpu;
+    x265_dolby_vision_rpu  m_rpu;
 
     /* Frame Parallelism - notification between FrameEncoders of available motion reference rows */
     ThreadSafeInteger*     m_reconRowFlag;       // flag of CTU rows completely reconstructed and extended for motion reference
@@ -137,6 +137,10 @@
     pixel*                 m_gaussianPic;
     pixel*                 m_thetaPic;
 
+    /* edge bit plane, allocated only for rskip mode 2 (EDGE_BASED_RSKIP) */
+    pixel*                 m_edgeBitPlane;
+    pixel*                 m_edgeBitPic;
+
     Frame();
 
     bool create(x265_param *param, float* quantOffsets);
​

x265_3.3.tar.gz/source/common/param.cpp -> x265_3.4.tar.gz/source/common/param.cpp Changed

@@ -198,7 +198,8 @@
     param->bEnableWeightedPred = 1;
     param->bEnableWeightedBiPred = 0;
     param->bEnableEarlySkip = 1;
-    param->bEnableRecursionSkip = 1;
+    param->recursionSkipMode = 1;
+    param->edgeVarThreshold = 0.05f;
     param->bEnableAMP = 0;
     param->bEnableRectInter = 0;
     param->rdLevel = 3;
@@ -285,6 +286,7 @@
     param->rc.bEnableConstVbv = 0;
     param->bResetZoneConfig = 1;
     param->reconfigWindowSize = 0;
+    param->decoderVbvMaxRate = 0;
 
     /* Video Usability Information (VUI) */
     param->vui.aspectRatioIdc = 0;
@@ -546,7 +548,7 @@
             param->maxNumMergeCand = 5;
             param->searchMethod = X265_STAR_SEARCH;
             param->bEnableTransformSkip = 1;
-            param->bEnableRecursionSkip = 0;
+            param->recursionSkipMode = 0;
             param->maxNumReferences = 5;
             param->limitReferences = 0;
             param->lookaheadSlices = 0; // disabled for best quality
@@ -598,7 +600,7 @@
             param->rc.hevcAq = 0;
             param->rc.qpStep = 1;
             param->rc.bEnableGrain = 1;
-            param->bEnableRecursionSkip = 0;
+            param->recursionSkipMode = 0;
             param->psyRd = 4.0;
             param->psyRdoq = 10.0;
             param->bEnableSAO = 0;
@@ -702,8 +704,9 @@
     OPT("ref") p->maxNumReferences = atoi(value);
     OPT("fast-intra") p->bEnableFastIntra = atobool(value);
     OPT("early-skip") p->bEnableEarlySkip = atobool(value);
-    OPT("rskip") p->bEnableRecursionSkip = atobool(value);
-    OPT("me")p->searchMethod = parseName(value, x265_motion_est_names, bError);
+    OPT("rskip") p->recursionSkipMode = atoi(value);
+    OPT("rskip-edge-threshold") p->edgeVarThreshold = atoi(value)/100.0f;
+    OPT("me") p->searchMethod = parseName(value, x265_motion_est_names, bError);
     OPT("subme") p->subpelRefine = atoi(value);
     OPT("merange") p->searchRange = atoi(value);
     OPT("rect") p->bEnableRectInter = atobool(value);
@@ -919,7 +922,7 @@
     OPT("max-merge") p->maxNumMergeCand = (uint32_t)atoi(value);
     OPT("temporal-mvp") p->bEnableTemporalMvp = atobool(value);
     OPT("early-skip") p->bEnableEarlySkip = atobool(value);
-    OPT("rskip") p->bEnableRecursionSkip = atobool(value);
+    OPT("rskip") p->recursionSkipMode = atoi(value);
     OPT("rdpenalty") p->rdPenalty = atoi(value);
     OPT("tskip") p->bEnableTransformSkip = atobool(value);
     OPT("no-tskip-fast") p->bEnableTSkipFast = atobool(value);
@@ -1221,6 +1224,7 @@
             }
         }
         OPT("hist-threshold") p->edgeTransitionThreshold = atof(value);
+        OPT("rskip-edge-threshold") p->edgeVarThreshold = atoi(value)/100.0f;
         OPT("lookahead-threads") p->lookaheadThreads = atoi(value);
         OPT("opt-cu-delta-qp") p->bOptCUDeltaQP = atobool(value);
         OPT("multi-pass-opt-analysis") p->analysisMultiPassRefine = atobool(value);
@@ -1596,9 +1600,16 @@
     CHECK(param->rdLevel < 1 || param->rdLevel > 6,
           "RD Level is out of range");
     CHECK(param->rdoqLevel < 0 || param->rdoqLevel > 2,
-        "RDOQ Level is out of range");
+          "RDOQ Level is out of range");
     CHECK(param->dynamicRd < 0 || param->dynamicRd > x265_ADAPT_RD_STRENGTH,
-        "Dynamic RD strength must be between 0 and 4");
+          "Dynamic RD strength must be between 0 and 4");
+    CHECK(param->recursionSkipMode > 2 || param->recursionSkipMode < 0,
+          "Invalid Recursion skip mode. Valid modes 0,1,2");
+    if (param->recursionSkipMode == EDGE_BASED_RSKIP)
+    {
+        CHECK(param->edgeVarThreshold < 0.0f || param->edgeVarThreshold > 1.0f,
+              "Minimum edge density percentage for a CU should be an integer between 0 to 100");
+    }
     CHECK(param->bframes && param->bframes >= param->lookaheadDepth && !param->rc.bStatRead,
           "Lookahead depth must be greater than the max consecutive bframe count");
     CHECK(param->bframes < 0,
@@ -1789,6 +1800,7 @@
     }
     CHECK(param->confWinRightOffset < 0, "Conformance Window Right Offset must be 0 or greater");
     CHECK(param->confWinBottomOffset < 0, "Conformance Window Bottom Offset must be 0 or greater");
+    CHECK(param->decoderVbvMaxRate < 0, "Invalid Decoder Vbv Maxrate. Value can not be less than zero");
     return check_failed;
 }
 
@@ -1908,7 +1920,9 @@
     TOOLVAL(param->psyRdoq, "psy-rdoq=%.2lf");
     TOOLOPT(param->bEnableRdRefine, "rd-refine");
     TOOLOPT(param->bEnableEarlySkip, "early-skip");
-    TOOLOPT(param->bEnableRecursionSkip, "rskip");
+    TOOLVAL(param->recursionSkipMode, "rskip mode=%d");
+    if (param->recursionSkipMode == EDGE_BASED_RSKIP)
+        TOOLVAL(param->edgeVarThreshold, "rskip-edge-threshold=%.2f");
     TOOLOPT(param->bEnableSplitRdSkip, "splitrd-skip");
     TOOLVAL(param->noiseReductionIntra, "nr-intra=%d");
     TOOLVAL(param->noiseReductionInter, "nr-inter=%d");
@@ -2066,7 +2080,10 @@
     s += sprintf(s, " rd=%d", p->rdLevel);
     s += sprintf(s, " selective-sao=%d", p->selectiveSAO);
     BOOL(p->bEnableEarlySkip, "early-skip");
-    BOOL(p->bEnableRecursionSkip, "rskip");
+    BOOL(p->recursionSkipMode, "rskip");
+    if (p->recursionSkipMode == EDGE_BASED_RSKIP)
+        s += sprintf(s, " rskip-edge-threshold=%f", p->edgeVarThreshold);
+
     BOOL(p->bEnableFastIntra, "fast-intra");
     BOOL(p->bEnableTSkipFast, "tskip-fast");
     BOOL(p->bCULossless, "cu-lossless");
@@ -2204,6 +2221,7 @@
     if (p->bEnableSceneCutAwareQp)
         s += sprintf(s, " scenecut-window=%d max-qp-delta=%d", p->scenecutWindow, p->maxQpDelta);
     s += sprintf(s, "conformance-window-offsets right=%d bottom=%d", p->confWinRightOffset, p->confWinBottomOffset);
+    s += sprintf(s, " decoder-max-rate=%d", p->decoderVbvMaxRate);
 #undef BOOL
     return buf;
 }
@@ -2373,7 +2391,8 @@
     dst->bSaoNonDeblocked = src->bSaoNonDeblocked;
     dst->rdLevel = src->rdLevel;
     dst->bEnableEarlySkip = src->bEnableEarlySkip;
-    dst->bEnableRecursionSkip = src->bEnableRecursionSkip;
+    dst->recursionSkipMode = src->recursionSkipMode;
+    dst->edgeVarThreshold = src->edgeVarThreshold;
     dst->bEnableFastIntra = src->bEnableFastIntra;
     dst->bEnableTSkipFast = src->bEnableTSkipFast;
     dst->bCULossless = src->bCULossless;
@@ -2419,8 +2438,9 @@
     dst->rc.zonefileCount = src->rc.zonefileCount;
     dst->reconfigWindowSize = src->reconfigWindowSize;
     dst->bResetZoneConfig = src->bResetZoneConfig;
+    dst->decoderVbvMaxRate = src->decoderVbvMaxRate;
 
-    if (src->rc.zonefileCount && src->rc.zones)
+    if (src->rc.zonefileCount && src->rc.zones && src->bResetZoneConfig)
     {
         for (int i = 0; i < src->rc.zonefileCount; i++)
         {

 
@@ -198,7 +198,8 @@
     param->bEnableWeightedPred = 1;
     param->bEnableWeightedBiPred = 0;
     param->bEnableEarlySkip = 1;
-    param->bEnableRecursionSkip = 1;
+    param->recursionSkipMode = 1;
+    param->edgeVarThreshold = 0.05f;
     param->bEnableAMP = 0;
     param->bEnableRectInter = 0;
     param->rdLevel = 3;
@@ -285,6 +286,7 @@
     param->rc.bEnableConstVbv = 0;
     param->bResetZoneConfig = 1;
     param->reconfigWindowSize = 0;
+    param->decoderVbvMaxRate = 0;
 
     /* Video Usability Information (VUI) */
     param->vui.aspectRatioIdc = 0;
@@ -546,7 +548,7 @@
             param->maxNumMergeCand = 5;
             param->searchMethod = X265_STAR_SEARCH;
             param->bEnableTransformSkip = 1;
-            param->bEnableRecursionSkip = 0;
+            param->recursionSkipMode = 0;
             param->maxNumReferences = 5;
             param->limitReferences = 0;
             param->lookaheadSlices = 0; // disabled for best quality
@@ -598,7 +600,7 @@
             param->rc.hevcAq = 0;
             param->rc.qpStep = 1;
             param->rc.bEnableGrain = 1;
-            param->bEnableRecursionSkip = 0;
+            param->recursionSkipMode = 0;
             param->psyRd = 4.0;
             param->psyRdoq = 10.0;
             param->bEnableSAO = 0;
@@ -702,8 +704,9 @@
     OPT("ref") p->maxNumReferences = atoi(value);
     OPT("fast-intra") p->bEnableFastIntra = atobool(value);
     OPT("early-skip") p->bEnableEarlySkip = atobool(value);
-    OPT("rskip") p->bEnableRecursionSkip = atobool(value);
-    OPT("me")p->searchMethod = parseName(value, x265_motion_est_names, bError);
+    OPT("rskip") p->recursionSkipMode = atoi(value);
+    OPT("rskip-edge-threshold") p->edgeVarThreshold = atoi(value)/100.0f;
+    OPT("me") p->searchMethod = parseName(value, x265_motion_est_names, bError);
     OPT("subme") p->subpelRefine = atoi(value);
     OPT("merange") p->searchRange = atoi(value);
     OPT("rect") p->bEnableRectInter = atobool(value);
@@ -919,7 +922,7 @@
     OPT("max-merge") p->maxNumMergeCand = (uint32_t)atoi(value);
     OPT("temporal-mvp") p->bEnableTemporalMvp = atobool(value);
     OPT("early-skip") p->bEnableEarlySkip = atobool(value);
-    OPT("rskip") p->bEnableRecursionSkip = atobool(value);
+    OPT("rskip") p->recursionSkipMode = atoi(value);
     OPT("rdpenalty") p->rdPenalty = atoi(value);
     OPT("tskip") p->bEnableTransformSkip = atobool(value);
     OPT("no-tskip-fast") p->bEnableTSkipFast = atobool(value);
@@ -1221,6 +1224,7 @@
             }
         }
         OPT("hist-threshold") p->edgeTransitionThreshold = atof(value);
+        OPT("rskip-edge-threshold") p->edgeVarThreshold = atoi(value)/100.0f;
         OPT("lookahead-threads") p->lookaheadThreads = atoi(value);
         OPT("opt-cu-delta-qp") p->bOptCUDeltaQP = atobool(value);
         OPT("multi-pass-opt-analysis") p->analysisMultiPassRefine = atobool(value);
@@ -1596,9 +1600,16 @@
     CHECK(param->rdLevel < 1 || param->rdLevel > 6,
           "RD Level is out of range");
     CHECK(param->rdoqLevel < 0 || param->rdoqLevel > 2,
-        "RDOQ Level is out of range");
+          "RDOQ Level is out of range");
     CHECK(param->dynamicRd < 0 || param->dynamicRd > x265_ADAPT_RD_STRENGTH,
-        "Dynamic RD strength must be between 0 and 4");
+          "Dynamic RD strength must be between 0 and 4");
+    CHECK(param->recursionSkipMode > 2 || param->recursionSkipMode < 0,
+          "Invalid Recursion skip mode. Valid modes 0,1,2");
+    if (param->recursionSkipMode == EDGE_BASED_RSKIP)
+    {
+        CHECK(param->edgeVarThreshold < 0.0f || param->edgeVarThreshold > 1.0f,
+              "Minimum edge density percentage for a CU should be an integer between 0 to 100");
+    }
     CHECK(param->bframes && param->bframes >= param->lookaheadDepth && !param->rc.bStatRead,
           "Lookahead depth must be greater than the max consecutive bframe count");
     CHECK(param->bframes < 0,
@@ -1789,6 +1800,7 @@
     }
     CHECK(param->confWinRightOffset < 0, "Conformance Window Right Offset must be 0 or greater");
     CHECK(param->confWinBottomOffset < 0, "Conformance Window Bottom Offset must be 0 or greater");
+    CHECK(param->decoderVbvMaxRate < 0, "Invalid Decoder Vbv Maxrate. Value can not be less than zero");
     return check_failed;
 }
 
@@ -1908,7 +1920,9 @@
     TOOLVAL(param->psyRdoq, "psy-rdoq=%.2lf");
     TOOLOPT(param->bEnableRdRefine, "rd-refine");
     TOOLOPT(param->bEnableEarlySkip, "early-skip");
-    TOOLOPT(param->bEnableRecursionSkip, "rskip");
+    TOOLVAL(param->recursionSkipMode, "rskip mode=%d");
+    if (param->recursionSkipMode == EDGE_BASED_RSKIP)
+        TOOLVAL(param->edgeVarThreshold, "rskip-edge-threshold=%.2f");
     TOOLOPT(param->bEnableSplitRdSkip, "splitrd-skip");
     TOOLVAL(param->noiseReductionIntra, "nr-intra=%d");
     TOOLVAL(param->noiseReductionInter, "nr-inter=%d");
@@ -2066,7 +2080,10 @@
     s += sprintf(s, " rd=%d", p->rdLevel);
     s += sprintf(s, " selective-sao=%d", p->selectiveSAO);
     BOOL(p->bEnableEarlySkip, "early-skip");
-    BOOL(p->bEnableRecursionSkip, "rskip");
+    BOOL(p->recursionSkipMode, "rskip");
+    if (p->recursionSkipMode == EDGE_BASED_RSKIP)
+        s += sprintf(s, " rskip-edge-threshold=%f", p->edgeVarThreshold);
+
     BOOL(p->bEnableFastIntra, "fast-intra");
     BOOL(p->bEnableTSkipFast, "tskip-fast");
     BOOL(p->bCULossless, "cu-lossless");
@@ -2204,6 +2221,7 @@
     if (p->bEnableSceneCutAwareQp)
         s += sprintf(s, " scenecut-window=%d max-qp-delta=%d", p->scenecutWindow, p->maxQpDelta);
     s += sprintf(s, "conformance-window-offsets right=%d bottom=%d", p->confWinRightOffset, p->confWinBottomOffset);
+    s += sprintf(s, " decoder-max-rate=%d", p->decoderVbvMaxRate);
 #undef BOOL
     return buf;
 }
@@ -2373,7 +2391,8 @@
     dst->bSaoNonDeblocked = src->bSaoNonDeblocked;
     dst->rdLevel = src->rdLevel;
     dst->bEnableEarlySkip = src->bEnableEarlySkip;
-    dst->bEnableRecursionSkip = src->bEnableRecursionSkip;
+    dst->recursionSkipMode = src->recursionSkipMode;
+    dst->edgeVarThreshold = src->edgeVarThreshold;
     dst->bEnableFastIntra = src->bEnableFastIntra;
     dst->bEnableTSkipFast = src->bEnableTSkipFast;
     dst->bCULossless = src->bCULossless;
@@ -2419,8 +2438,9 @@
     dst->rc.zonefileCount = src->rc.zonefileCount;
     dst->reconfigWindowSize = src->reconfigWindowSize;
     dst->bResetZoneConfig = src->bResetZoneConfig;
+    dst->decoderVbvMaxRate = src->decoderVbvMaxRate;
 
-    if (src->rc.zonefileCount && src->rc.zones)
+    if (src->rc.zonefileCount && src->rc.zones && src->bResetZoneConfig)
     {
         for (int i = 0; i < src->rc.zonefileCount; i++)
         {
​

x265_3.3.tar.gz/source/common/pixel.cpp -> x265_3.4.tar.gz/source/common/pixel.cpp Changed

@@ -5,6 +5,7 @@
  *          Mandar Gurav <mandar@multicorewareinc.com>
  *          Mahesh Pittala <mahesh@multicorewareinc.com>
  *          Min Chen <min.chen@multicorewareinc.com>
+ *          Hongbin Liu<liuhongbin1@huawei.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -265,6 +266,10 @@
 {
     int satd = 0;
 
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
+    pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
+#endif
+
     for (int row = 0; row < h; row += 4)
         for (int col = 0; col < w; col += 4)
             satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
@@ -279,6 +284,10 @@
 {
     int satd = 0;
 
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
+    pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
+#endif
+
     for (int row = 0; row < h; row += 4)
         for (int col = 0; col < w; col += 8)
             satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
@@ -876,6 +885,18 @@
     }
 }
 
+/* Copy a width x height block of pixels from src to dst, right-shifting
+ * every sample by 'shift' bits on the way. Both strides are counted in
+ * pixels, not bytes. */
+static void planecopy_pp_shr_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
+{
+    for (int rowsLeft = height; rowsLeft > 0; rowsLeft--)
+    {
+        for (int x = 0; x < width; x++)
+            dst[x] = (pixel)(src[x] >> shift);
+
+        // step both pointers to the start of the next row
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
 static void planecopy_sp_shl_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
 {
     for (int r = 0; r < height; r++)
@@ -1316,6 +1337,7 @@
     p.planecopy_cp = planecopy_cp_c;
     p.planecopy_sp = planecopy_sp_c;
     p.planecopy_sp_shl = planecopy_sp_shl_c;
+    p.planecopy_pp_shr = planecopy_pp_shr_c;
 #if HIGH_BIT_DEPTH
     p.planeClipAndMax = planeClipAndMax_c;
 #endif

 
@@ -5,6 +5,7 @@
  *          Mandar Gurav <mandar@multicorewareinc.com>
  *          Mahesh Pittala <mahesh@multicorewareinc.com>
  *          Min Chen <min.chen@multicorewareinc.com>
+ *          Hongbin Liu<liuhongbin1@huawei.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -265,6 +266,10 @@
 {
     int satd = 0;
 
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
+    pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
+#endif
+
     for (int row = 0; row < h; row += 4)
         for (int col = 0; col < w; col += 4)
             satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
@@ -279,6 +284,10 @@
 {
     int satd = 0;
 
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
+    pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
+#endif
+
     for (int row = 0; row < h; row += 4)
         for (int col = 0; col < w; col += 8)
             satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
@@ -876,6 +885,18 @@
     }
 }
 
+/* Copy a width x height block of pixels from src to dst, right-shifting
+ * every sample by 'shift' bits on the way. Both strides are counted in
+ * pixels, not bytes. */
+static void planecopy_pp_shr_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
+{
+    for (int rowsLeft = height; rowsLeft > 0; rowsLeft--)
+    {
+        for (int x = 0; x < width; x++)
+            dst[x] = (pixel)(src[x] >> shift);
+
+        // step both pointers to the start of the next row
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
 static void planecopy_sp_shl_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
 {
     for (int r = 0; r < height; r++)
@@ -1316,6 +1337,7 @@
     p.planecopy_cp = planecopy_cp_c;
     p.planecopy_sp = planecopy_sp_c;
     p.planecopy_sp_shl = planecopy_sp_shl_c;
+    p.planecopy_pp_shr = planecopy_pp_shr_c;
 #if HIGH_BIT_DEPTH
     p.planeClipAndMax = planeClipAndMax_c;
 #endif
​

x265_3.3.tar.gz/source/common/primitives.h -> x265_3.4.tar.gz/source/common/primitives.h Changed

@@ -8,6 +8,8 @@
  *          Rajesh Paulraj <rajesh@multicorewareinc.com>
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
  *          Min Chen <chenm003@163.com>
+ *          Hongbin Liu<liuhongbin1@huawei.com>
+ *          Yimeng Su <yimeng.su@huawei.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -204,6 +206,7 @@
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+typedef void (*planecopy_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 typedef pixel (*planeClipAndMax_t)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
 
 typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
@@ -358,6 +361,7 @@
     planecopy_cp_t        planecopy_cp;
     planecopy_sp_t        planecopy_sp;
     planecopy_sp_t        planecopy_sp_shl;
+    planecopy_pp_t        planecopy_pp_shr;
     planeClipAndMax_t     planeClipAndMax;
 
     weightp_sp_t          weight_sp;
@@ -465,6 +469,9 @@
 void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask);
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask);
 void setupAliasPrimitives(EncoderPrimitives &p);
+#if X265_ARCH_ARM64
+void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask);
+#endif
 #if HAVE_ALTIVEC
 void setupPixelPrimitives_altivec(EncoderPrimitives &p);
 void setupDCTPrimitives_altivec(EncoderPrimitives &p);
@@ -479,4 +486,10 @@
 extern const char* PFX(build_info_str);
 #endif
 
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
+extern "C" {
+#include "aarch64/pixel-util.h"
+}
+#endif
+
 #endif // ifndef X265_PRIMITIVES_H

 
@@ -8,6 +8,8 @@
  *          Rajesh Paulraj <rajesh@multicorewareinc.com>
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
  *          Min Chen <chenm003@163.com>
+ *          Hongbin Liu<liuhongbin1@huawei.com>
+ *          Yimeng Su <yimeng.su@huawei.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -204,6 +206,7 @@
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+typedef void (*planecopy_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 typedef pixel (*planeClipAndMax_t)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
 
 typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
@@ -358,6 +361,7 @@
     planecopy_cp_t        planecopy_cp;
     planecopy_sp_t        planecopy_sp;
     planecopy_sp_t        planecopy_sp_shl;
+    planecopy_pp_t        planecopy_pp_shr;
     planeClipAndMax_t     planeClipAndMax;
 
     weightp_sp_t          weight_sp;
@@ -465,6 +469,9 @@
 void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask);
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask);
 void setupAliasPrimitives(EncoderPrimitives &p);
+#if X265_ARCH_ARM64
+void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask);
+#endif
 #if HAVE_ALTIVEC
 void setupPixelPrimitives_altivec(EncoderPrimitives &p);
 void setupDCTPrimitives_altivec(EncoderPrimitives &p);
@@ -479,4 +486,10 @@
 extern const char* PFX(build_info_str);
 #endif
 
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
+extern "C" {
+#include "aarch64/pixel-util.h"
+}
+#endif
+
 #endif // ifndef X265_PRIMITIVES_H
​

x265_3.4.tar.gz/source/common/scaler.cpp Added

@@ -0,0 +1,1110 @@
+/*****************************************************************************
+* Copyright (C) 2013-2020 MulticoreWare, Inc
+*
+* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#include "scaler.h"
+
+#if _MSC_VER
+#pragma warning(disable: 4706) // assignment within conditional
+#pragma warning(disable: 4244) // '=' : possible loss of data
+#endif
+
+#define SHORT_MIN (-(1 << 15))
+#define SHORT_MAX ((1 << 15) - 1)
+#define SHORT_MAX_10 ((1 << 10) - 1)
+
+namespace X265_NS{
+
+/* Start from an empty manager: every dimension and sub-sampling field is
+ * zero and none of the slice / filter slots holds an object yet. */
+ScalerFilterManager::ScalerFilterManager()
+    : m_bitDepth(0)
+    , m_algorithmFlags(0)
+    , m_srcW(0)
+    , m_srcH(0)
+    , m_dstW(0)
+    , m_dstH(0)
+    , m_crSrcW(0)
+    , m_crSrcH(0)
+    , m_crDstW(0)
+    , m_crDstH(0)
+    , m_crSrcHSubSample(0)
+    , m_crSrcVSubSample(0)
+    , m_crDstHSubSample(0)
+    , m_crDstVSubSample(0)
+{
+    int slot = 0;
+    while (slot < m_numSlice)
+        m_slices[slot++] = NULL;
+
+    slot = 0;
+    while (slot < m_numFilter)
+        m_ScalerFilters[slot++] = NULL;
+}
+
+/* Copy 'size' 64-bit coefficients from 'filter' into 'filter2', front to
+ * back. Nothing is copied when 'size' is zero or negative. */
+inline static void filter_copy_c(int64_t* filter, int64_t* filter2, int size)
+{
+    const int64_t* from = filter;
+    int64_t* to = filter2;
+
+    for (int left = size; left > 0; left--)
+        *to++ = *from++;
+}
+
+#if X265_DEPTH == 8
+/* Horizontal scaling of one line of 8-bit samples. Output sample x is the
+ * dot product of filterSize consecutive source samples, starting at
+ * filterPos[x], with the x-th group of filterSize coefficients. The sum is
+ * shifted down by 7 bits and clamped to [SHORT_MIN, SHORT_MAX]. */
+static void doScaling_c(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
+{
+    for (int x = 0; x < dstW; x++)
+    {
+        const int firstSample = filterPos[x];
+        const int firstCoeff = filterSize * x;
+        int acc = 0;
+
+        for (int tap = 0; tap < filterSize; tap++)
+            acc += ((int)src[firstSample + tap]) * filter[firstCoeff + tap];
+
+        // the filtered sum can fall outside the 16-bit range, hence the clamp
+        dst[x] = x265_clip3(SHORT_MIN, SHORT_MAX, acc >> 7);
+    }
+}
+/* Clamp a signed integer to the 8-bit range [0, 255].
+ *
+ * Plain comparisons are used instead of the (-a) >> 31 trick: negating
+ * INT_MIN overflows, and the result of right-shifting a negative value is
+ * implementation-defined. */
+static uint8_t clipUint8(int a)
+{
+    if (a < 0)
+        return 0;
+    if (a > 0xFF)
+        return 0xFF;
+    return (uint8_t)a;
+}
+
+/* Vertical filter producing one line of 8-bit output. For every column the
+ * filterSize input lines are weighted by filter[] and added to a starting
+ * offset of 64 << 12 (half of 1 << 19); the sum is then shifted down by
+ * 19 bits and passed through clipUint8(). */
+static void yuv2PlaneX_c(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+    const int startOffset = 64 << 12;
+
+    for (int x = 0; x < dstW; x++)
+    {
+        int acc = startOffset;
+        for (int tap = 0; tap < filterSize; tap++)
+            acc += src[tap][x] * filter[tap];
+
+        dest[x] = clipUint8(acc >> 19);
+    }
+}
+#else
+/* Vertical filter producing one line of 16-bit output samples. For every
+ * column the filterSize input lines are weighted by filter[] and added to
+ * a starting offset of 1 << 16; the sum is shifted down by 17 bits and
+ * clamped to [0, SHORT_MAX_10]. Each sample is stored into 'dest' as two
+ * bytes, low byte first. */
+static void yuv2PlaneX_c_h(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+    uint16_t* out16 = (uint16_t *)dest;
+
+    for (int x = 0; x < dstW; x++)
+    {
+        int acc = 1 << 16;
+        for (int tap = 0; tap < filterSize; tap++)
+            acc += src[tap][x] * filter[tap];
+
+        uint16_t sample = x265_clip3(0, SHORT_MAX_10, acc >> 17);
+
+        // write the two bytes explicitly so the low byte always comes first
+        uint8_t* bytes = (uint8_t*)(&out16[x]);
+        bytes[0] = (sample);
+        bytes[1] = (sample) >> 8;
+    }
+}
+/* Horizontal scaling of one line of 16-bit samples ('src' is reinterpreted
+ * as uint16_t). Output sample x is the dot product of filterSize
+ * consecutive source samples, starting at filterPos[x], with the x-th group
+ * of filterSize coefficients. The sum is shifted down by 9 bits and clamped
+ * to [SHORT_MIN, SHORT_MAX]. */
+static void doScaling_c_h(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
+{
+    const uint16_t *samples = (const uint16_t *)src;
+
+    for (int x = 0; x < dstW; x++)
+    {
+        const int firstSample = filterPos[x];
+        const int firstCoeff = filterSize * x;
+        int acc = 0;
+
+        for (int tap = 0; tap < filterSize; tap++)
+            acc += ((int)samples[firstSample + tap]) * filter[firstCoeff + tap];
+
+        // the filtered sum can fall outside the 16-bit range, hence the clamp
+        dst[x] = x265_clip3(SHORT_MIN, SHORT_MAX, acc >> 9);
+    }
+}
+#endif
+
+/* Construct a filter with zero length, no position / coefficient tables
+ * and no source or destination slice attached. */
+ScalerFilter::ScalerFilter()
+    : m_filtLen(0)
+    , m_filtPos(NULL)
+    , m_filt(NULL)
+    , m_sourceSlice(NULL)
+    , m_destSlice(NULL)
+{
+}
+
+/* Release the position and coefficient tables owned by this filter. */
+ScalerFilter::~ScalerFilter()
+{
+    // delete[] on a null pointer is a no-op, so no guard is needed
+    delete[] m_filtPos;
+    m_filtPos = NULL;
+
+    delete[] m_filt;
+    m_filt = NULL;
+}
+
+/* Run the horizontal scaler over 'sliceHor' lines of plane 0, starting at
+ * line 'sliceVer'. Line indices into each slice's buffer are taken relative
+ * to that plane's own sliceVer, and the destination plane's sliceHor
+ * counter is bumped once per processed line. */
+void ScalerHLumFilter::process(int sliceVer, int sliceHor)
+{
+    uint8_t ** inLines = m_sourceSlice->m_plane[0].lineBuf;
+    uint8_t ** outLines = m_destSlice->m_plane[0].lineBuf;
+    int inBase = sliceVer - m_sourceSlice->m_plane[0].sliceVer;
+    int outBase = sliceVer - m_destSlice->m_plane[0].sliceVer;
+    int outWidth = m_destSlice->m_width;
+
+    for (int line = 0; line < sliceHor; ++line)
+    {
+        int16_t* outLine = (int16_t*)outLines[outBase + line];
+        const uint8_t* inLine = (const uint8_t *)inLines[inBase + line];
+
+        m_hFilterScaler->doScaling(outLine, outWidth, inLine, m_filt, m_filtPos, m_filtLen);
+        m_destSlice->m_plane[0].sliceHor += 1;
+    }
+}
+
+/* Run the horizontal scaler over 'sliceHor' lines of planes 1 and 2,
+ * starting at line 'sliceVer'. The output width is the destination slice
+ * width shifted right by its horizontal chroma sub-sampling. Each plane's
+ * line index is taken relative to that plane's own sliceVer, and both
+ * destination planes have their sliceHor counter bumped once per line. */
+void ScalerHCrFilter::process(int sliceVer, int sliceHor)
+{
+    uint8_t ** inLines1 = m_sourceSlice->m_plane[1].lineBuf;
+    uint8_t ** outLines1 = m_destSlice->m_plane[1].lineBuf;
+    uint8_t ** inLines2 = m_sourceSlice->m_plane[2].lineBuf;
+    uint8_t ** outLines2 = m_destSlice->m_plane[2].lineBuf;
+
+    int inBase1 = sliceVer - m_sourceSlice->m_plane[1].sliceVer;
+    int outBase1 = sliceVer - m_destSlice->m_plane[1].sliceVer;
+    int inBase2 = sliceVer - m_sourceSlice->m_plane[2].sliceVer;
+    int outBase2 = sliceVer - m_destSlice->m_plane[2].sliceVer;
+
+    int outWidth = m_destSlice->m_width >> m_destSlice->m_hCrSubSample;
+
+    for (int line = 0; line < sliceHor; ++line)
+    {
+        // plane 1 first, then plane 2
+        m_hFilterScaler->doScaling((int16_t*)outLines1[outBase1 + line], outWidth, inLines1[inBase1 + line], m_filt, m_filtPos, m_filtLen);
+        m_hFilterScaler->doScaling((int16_t*)outLines2[outBase2 + line], outWidth, inLines2[inBase2 + line], m_filt, m_filtPos, m_filtLen);
+
+        m_destSlice->m_plane[1].sliceHor += 1;
+        m_destSlice->m_plane[2].sliceHor += 1;
+    }
+}
+
+/* Vertical filter entry point. Forwards all arguments unchanged to the C
+ * implementation selected at compile time: yuv2PlaneX_c when X265_DEPTH is
+ * 8, yuv2PlaneX_c_h otherwise. */
+void VFilterScaler8Bit::yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+#if X265_DEPTH == 8
+    yuv2PlaneX_c(filter, filterSize, src, dest, dstW);
+#else
+    yuv2PlaneX_c_h(filter, filterSize, src, dest, dstW);
+#endif
+}
+
+void VFilterScaler10Bit::yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+    int IdxW = FACTOR_4;
+    int IdxF = FIL_DEF;

 
@@ -0,0 +1,1110 @@
+/*****************************************************************************
+* Copyright (C) 2013-2020 MulticoreWare, Inc
+*
+* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#include "scaler.h"
+
+#if _MSC_VER
+#pragma warning(disable: 4706) // assignment within conditional
+#pragma warning(disable: 4244) // '=' : possible loss of data
+#endif
+
+#define SHORT_MIN (-(1 << 15))
+#define SHORT_MAX ((1 << 15) - 1)
+#define SHORT_MAX_10 ((1 << 10) - 1)
+
+namespace X265_NS{
+
+// Default constructor: zeroes the bit depth, algorithm flags, luma/chroma
+// source and destination dimensions and the chroma subsampling shifts, and
+// clears every slice and filter slot to NULL.
+ScalerFilterManager::ScalerFilterManager() :
+    m_bitDepth(0),
+    m_algorithmFlags(0),
+    m_srcW(0),
+    m_srcH(0),
+    m_dstW(0),
+    m_dstH(0),
+    m_crSrcW(0),
+    m_crSrcH(0),
+    m_crDstW(0),
+    m_crDstH(0),
+    m_crSrcHSubSample(0),
+    m_crSrcVSubSample(0),
+    m_crDstHSubSample(0),
+    m_crDstVSubSample(0)
+{
+    // No slice is allocated yet.
+    for (int i = 0; i < m_numSlice; i++)
+        m_slices[i] = NULL;
+    // No filter stage is allocated yet.
+    for (int i = 0; i < m_numFilter; i++)
+        m_ScalerFilters[i] = NULL;
+}
+
+// Copies the first `size` elements of `filter` into `filter2`
+// (argument order: source first, destination second).
+inline static void filter_copy_c(int64_t* filter, int64_t* filter2, int size)
+{
+    const int64_t* in = filter;
+    int64_t* out = filter2;
+    for (int remaining = size; remaining > 0; --remaining)
+        *out++ = *in++;
+}
+
+#if X265_DEPTH == 8
+// Horizontal scaling of one line of 8-bit samples (C reference kernel).
+// For each of the dstW outputs, filterSize source samples starting at
+// filterPos[x] are weighted by row x of the coefficient table `filter`,
+// summed, shifted right by 7 and clamped to [SHORT_MIN, SHORT_MAX].
+static void doScaling_c(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
+{
+    for (int x = 0; x < dstW; x++)
+    {
+        const int firstSample = filterPos[x];
+        const int firstTap = filterSize * x;
+        int acc = 0;
+        for (int t = 0; t < filterSize; t++)
+            acc += ((int)src[firstSample + t]) * filter[firstTap + t];
+        // the cubic equation does overflow ...
+        dst[x] = x265_clip3(SHORT_MIN, SHORT_MAX, acc >> 7);
+    }
+}
+// Clamps a to the unsigned 8-bit range [0, 255].
+static uint8_t clipUint8(int a)
+{
+    // Explicit comparisons avoid depending on an arithmetic right shift of a
+    // negative int, on a 32-bit int width, and on negating INT_MIN.
+    if (a < 0)
+        return 0;
+    if (a > 0xFF)
+        return 0xFF;
+    return (uint8_t)a;
+}
+
+// Vertical filtering of one output line to 8-bit samples (C reference kernel).
+// Each output is the sum over the filterSize input lines of
+// src[t][x] * filter[t], plus a rounding bias of 64 << 12 (half of 1 << 19),
+// shifted right by 19 and clamped by clipUint8.
+static void yuv2PlaneX_c(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+    const int roundBias = 64 << 12;
+    for (int x = 0; x < dstW; x++)
+    {
+        int acc = roundBias;
+        int t = 0;
+        while (t < filterSize)
+        {
+            acc += src[t][x] * filter[t];
+            t++;
+        }
+        dest[x] = clipUint8(acc >> 19);
+    }
+}
+#else
+// Vertical filtering of one output line to 16-bit-wide samples (C reference
+// kernel for builds where X265_DEPTH != 8).
+// Each output is the sum of src[t][x] * filter[t] plus a bias of 1 << 16,
+// shifted right by 17 and clamped to [0, SHORT_MAX_10]. The result is stored
+// as two bytes per sample, low byte first.
+static void yuv2PlaneX_c_h(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+    for (int x = 0; x < dstW; x++)
+    {
+        int acc = 1 << 16;
+        for (int t = 0; t < filterSize; t++)
+            acc += src[t][x] * filter[t];
+        uint16_t sample = x265_clip3(0, SHORT_MAX_10, acc >> 17);
+        // Byte address of the x-th 16-bit slot in the output line.
+        uint8_t* slot = dest + sizeof(uint16_t) * x;
+        slot[0] = (sample);
+        slot[1] = (sample) >> 8;
+    }
+}
+// Horizontal scaling of one line of 16-bit-wide samples (C reference kernel
+// for builds where X265_DEPTH != 8). `src` is reinterpreted as uint16_t
+// samples; each output is the weighted sum of filterSize samples starting at
+// filterPos[x], shifted right by 9 and clamped to [SHORT_MIN, SHORT_MAX].
+static void doScaling_c_h(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
+{
+    const uint16_t *samples = (const uint16_t *)src;
+    for (int x = 0; x < dstW; x++)
+    {
+        const int firstSample = filterPos[x];
+        const int firstTap = filterSize * x;
+        int acc = 0;
+        for (int t = 0; t < filterSize; t++)
+            acc += ((int)samples[firstSample + t]) * filter[firstTap + t];
+        // the cubic equation does overflow
+        dst[x] = x265_clip3(SHORT_MIN, SHORT_MAX, acc >> 9);
+    }
+}
+#endif
+
+// Default constructor: starts with a zero filter length, no coefficient or
+// position tables and no source/destination slices attached.
+ScalerFilter::ScalerFilter() :
+    m_filtLen(0),
+    m_filtPos(NULL),
+    m_filt(NULL),
+    m_sourceSlice(NULL),
+    m_destSlice(NULL)
+{
+}
+
+// Releases the position and coefficient tables and resets both pointers.
+ScalerFilter::~ScalerFilter()
+{
+    // delete[] on a null pointer is a no-op, so no explicit checks are needed.
+    delete[] m_filtPos;
+    m_filtPos = NULL;
+
+    delete[] m_filt;
+    m_filt = NULL;
+}
+
+// Horizontally scales sliceHor luma lines (plane 0), beginning at absolute
+// line sliceVer, from the source slice into the destination slice.
+void ScalerHLumFilter::process(int sliceVer, int sliceHor)
+{
+    uint8_t** const inRows  = m_sourceSlice->m_plane[0].lineBuf;
+    uint8_t** const outRows = m_destSlice->m_plane[0].lineBuf;
+    // Offsets of line sliceVer inside each plane's line buffer.
+    const int inOff  = sliceVer - m_sourceSlice->m_plane[0].sliceVer;
+    const int outOff = sliceVer - m_destSlice->m_plane[0].sliceVer;
+    const int lumaW  = m_destSlice->m_width;
+
+    int row = 0;
+    while (row < sliceHor)
+    {
+        m_hFilterScaler->doScaling((int16_t*)outRows[outOff + row], lumaW, (const uint8_t *)inRows[inOff + row], m_filt, m_filtPos, m_filtLen);
+        // One more finished line is now recorded for the destination plane.
+        m_destSlice->m_plane[0].sliceHor += 1;
+        row++;
+    }
+}
+
+// Horizontally scales sliceHor chroma lines, beginning at absolute line
+// sliceVer, from the source slice into the destination slice. Planes 1 and 2
+// are filtered with the same coefficient and position tables.
+void ScalerHCrFilter::process(int sliceVer, int sliceHor)
+{
+    // Output width is the slice width shifted down by the horizontal chroma subsampling.
+    const int chromaW = m_destSlice->m_width >> m_destSlice->m_hCrSubSample;
+
+    uint8_t** const inRows1  = m_sourceSlice->m_plane[1].lineBuf;
+    uint8_t** const outRows1 = m_destSlice->m_plane[1].lineBuf;
+    uint8_t** const inRows2  = m_sourceSlice->m_plane[2].lineBuf;
+    uint8_t** const outRows2 = m_destSlice->m_plane[2].lineBuf;
+
+    // Offsets of line sliceVer inside each plane's line buffer.
+    const int inOff1  = sliceVer - m_sourceSlice->m_plane[1].sliceVer;
+    const int outOff1 = sliceVer - m_destSlice->m_plane[1].sliceVer;
+    const int inOff2  = sliceVer - m_sourceSlice->m_plane[2].sliceVer;
+    const int outOff2 = sliceVer - m_destSlice->m_plane[2].sliceVer;
+
+    int row = 0;
+    while (row < sliceHor)
+    {
+        m_hFilterScaler->doScaling((int16_t*)outRows1[outOff1 + row], chromaW, inRows1[inOff1 + row], m_filt, m_filtPos, m_filtLen);
+        m_hFilterScaler->doScaling((int16_t*)outRows2[outOff2 + row], chromaW, inRows2[inOff2 + row], m_filt, m_filtPos, m_filtLen);
+        // One more finished line is now recorded for both destination planes.
+        m_destSlice->m_plane[1].sliceHor += 1;
+        m_destSlice->m_plane[2].sliceHor += 1;
+        row++;
+    }
+}
+
+// Vertical filtering entry point of the 8-bit scaler.
+// filter/filterSize: vertical coefficients; src: filterSize input lines;
+// dest: output line; dstW: number of output samples.
+// No per-size kernel dispatch happens here: every filterSize/dstW combination
+// is forwarded to the single C kernel selected at compile time by X265_DEPTH.
+void VFilterScaler8Bit::yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+#if X265_DEPTH == 8
+    yuv2PlaneX_c(filter, filterSize, src, dest, dstW);
+#else
+    yuv2PlaneX_c_h(filter, filterSize, src, dest, dstW);
+#endif
+}
+
+void VFilterScaler10Bit::yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+    int IdxW = FACTOR_4;
+    int IdxF = FIL_DEF;
​

x265_3.4.tar.gz/source/common/scaler.h Added

@@ -0,0 +1,254 @@
+/*****************************************************************************
+ * Copyright (C) 2013-2020 MulticoreWare, Inc
+ *
+ * Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_SCALER_H
+#define X265_SCALER_H
+
+#include "common.h"
+
+namespace X265_NS {
+//x265 private namespace
+
+class ScalerSlice;
+class VideoDesc;
+
+#define MAX_NUM_LINES_AHEAD 4
+#define SCALER_ALIGN(x, j) (((x)+(j)-1)&~((j)-1))
+#define X265_ABS(j) ((j) >= 0 ? (j) : (-(j)))
+#define SCALER_MAX_REDUCE_CUTOFF 0.002
+#define SCALER_BITEXACT  0x80000
+#define ROUNDED_DIVISION(i,j) (((i)>0 ? (i) + ((j)>>1) : (i) - ((j)>>1))/(j))
+#define UH_CEIL_SHIFTR(i,j) (!scale_builtin_constant_p(j) ? -((-(i)) >> (j)) \
+                                                          : ((i) + (1<<(j)) - 1) >> (j))
+
+#if defined(__GNUC__) || defined(__clang__)
+#    define scale_builtin_constant_p __builtin_constant_p
+#else
+#    define scale_builtin_constant_p(x) 0
+#endif
+
+enum ResFactor
+{
+    RES_FACTOR_64, RES_FACTOR_32, RES_FACTOR_16, RES_FACTOR_8,
+    RES_FACTOR_4, RES_FACTOR_DEF, NUM_RES_FACTOR
+};
+
+enum ScalerFactor
+{
+    FACTOR_4, FACTOR_8, NUM_FACTOR
+};
+
+enum FilterSize
+{
+    FIL_4, FIL_6, FIL_8, FIL_9, FIL_10, FIL_11, FIL_13, FIL_15,
+    FIL_16, FIL_17, FIL_19, FIL_22, FIL_24, FIL_DEF, NUM_FIL
+};
+
+// Abstract base of one scaling stage. It holds the coefficient and position
+// tables plus the source and destination slices; concrete stages implement
+// process().
+class ScalerFilter {
+public:
+    int             m_filtLen;      // Number of coefficients per output sample (passed to the kernels as filterSize).
+    int32_t*        m_filtPos;      // Array of horizontal/vertical starting pos for each dst for luma / chroma planes.
+    int16_t*        m_filt;         // Array of horizontal/vertical filter coefficients for luma / chroma planes.
+    ScalerSlice*    m_sourceSlice;  // Source slice
+    ScalerSlice*    m_destSlice;    // Output slice
+    ScalerFilter();
+    // Releases m_filtPos and m_filt with delete[].
+    virtual ~ScalerFilter();
+    // Processes sliceHor lines starting at absolute line sliceVer.
+    virtual void process(int sliceVer, int sliceHor) = 0;
+    // NOTE(review): presumably builds m_filt / m_filtPos / m_filtLen; the definition is not shown here - confirm.
+    int initCoeff(int flag, int inc, int srcW, int dstW, int filtAlign, int one, int sourcePos, int destPos);
+    // Attaches the slices read from and written to by process(); ownership is not transferred.
+    void setSlice(ScalerSlice* source, ScalerSlice* dest) { m_sourceSlice = source; m_destSlice = dest; }
+};
+
+// Bundles a picture's width, height, csp value and input bit depth.
+class VideoDesc {
+public:
+    int         m_width;
+    int         m_height;
+    int         m_csp;
+    int         m_inputDepth;
+
+    // Stores the four values unchanged.
+    VideoDesc(int w, int h, int csp, int bitDepth) :
+        m_width(w),
+        m_height(h),
+        m_csp(csp),
+        m_inputDepth(bitDepth)
+    {
+    }
+};
+
+// Line-buffer bookkeeping for one plane of a slice. The horizontal filters
+// index lineBuf relative to sliceVer and increment sliceHor once per line
+// they produce.
+typedef struct ScalerPlane
+{
+    int       availLines; // max number of lines that can be held by this plane
+    int       sliceVer;   // index of first line
+    int       sliceHor;   // number of lines
+    uint8_t** lineBuf;    // line buffer
+} ScalerPlane;
+
+// Base class of the horizontal scaling kernels; subclasses set m_bitDepth
+// and implement doScaling() for one line of samples.
+class HFilterScaler {
+public:
+    int m_bitDepth;
+
+    HFilterScaler() : m_bitDepth(0) {}
+    virtual ~HFilterScaler() {}
+    virtual void doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize) = 0;
+};
+
+// Base class of the vertical scaling kernels; subclasses set m_bitDepth
+// and implement yuv2PlaneX() for one output line.
+class VFilterScaler {
+public:
+    int m_bitDepth;
+
+    VFilterScaler() : m_bitDepth(0) {}
+    virtual ~VFilterScaler() {}
+    virtual void yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW) = 0;
+};
+
+// Horizontal scaling kernel for the 8-bit case.
+// The constructor records a bit depth of 8 in m_bitDepth.
+class HFilterScaler8Bit : public HFilterScaler {
+public:
+    HFilterScaler8Bit() { m_bitDepth = 8; }
+    virtual void doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize);
+};
+
+// Horizontal scaling kernel for the 10-bit case.
+// The constructor records a bit depth of 10 in m_bitDepth.
+class HFilterScaler10Bit : public HFilterScaler {
+public:
+    HFilterScaler10Bit() { m_bitDepth = 10; }
+    virtual void doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize);
+};
+
+// Vertical scaling kernel for the 8-bit case.
+// The constructor records a bit depth of 8 in m_bitDepth.
+class VFilterScaler8Bit : public VFilterScaler {
+public:
+    VFilterScaler8Bit() { m_bitDepth = 8; }
+    virtual void yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW);
+};
+
+// Vertical scaling kernel for the 10-bit case.
+// The constructor records a bit depth of 10 in m_bitDepth.
+class VFilterScaler10Bit : public VFilterScaler {
+public:
+    VFilterScaler10Bit() { m_bitDepth = 10; }
+    virtual void yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW);
+};
+
+// Horizontal filter for luma.
+// Owns a bit-depth specific HFilterScaler that scales each line.
+class ScalerHLumFilter : public ScalerFilter {
+private:
+    HFilterScaler* m_hFilterScaler; // NULL when bitDepth is neither 8 nor 10
+public:
+    // bitDepth selects the 8-bit or 10-bit kernel; any other value leaves the
+    // pointer NULL rather than uninitialized.
+    ScalerHLumFilter(int bitDepth) : m_hFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_hFilterScaler = new HFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_hFilterScaler = new HFilterScaler10Bit;
+    }
+    // The kernel is created with new, so it is released with delete (which
+    // also runs its virtual destructor); delete on NULL is a no-op.
+    ~ScalerHLumFilter() { delete m_hFilterScaler; }
+    // Scales sliceHor luma lines starting at absolute line sliceVer.
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// Horizontal filter for chroma.
+// Owns a bit-depth specific HFilterScaler that scales each line.
+class ScalerHCrFilter : public ScalerFilter {
+private:
+    HFilterScaler* m_hFilterScaler; // NULL when bitDepth is neither 8 nor 10
+public:
+    // bitDepth selects the 8-bit or 10-bit kernel; any other value leaves the
+    // pointer NULL rather than uninitialized.
+    ScalerHCrFilter(int bitDepth) : m_hFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_hFilterScaler = new HFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_hFilterScaler = new HFilterScaler10Bit;
+    }
+    // The kernel is created with new, so it is released with delete (which
+    // also runs its virtual destructor); delete on NULL is a no-op.
+    ~ScalerHCrFilter() { delete m_hFilterScaler; }
+    // Scales sliceHor chroma lines starting at absolute line sliceVer.
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// Vertical filter for luma.
+// Owns a bit-depth specific VFilterScaler that produces each output line.
+class ScalerVLumFilter : public ScalerFilter {
+private:
+    VFilterScaler* m_vFilterScaler; // NULL when bitDepth is neither 8 nor 10
+public:
+    // bitDepth selects the 8-bit or 10-bit kernel; any other value leaves the
+    // pointer NULL rather than uninitialized.
+    ScalerVLumFilter(int bitDepth) : m_vFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_vFilterScaler = new VFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_vFilterScaler = new VFilterScaler10Bit;
+    }
+    // The kernel is created with new, so it is released with delete (which
+    // also runs its virtual destructor); delete on NULL is a no-op.
+    ~ScalerVLumFilter() { delete m_vFilterScaler; }
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// Vertical filter for chroma.
+// Owns a bit-depth specific VFilterScaler that produces each output line.
+class ScalerVCrFilter : public ScalerFilter {
+private:
+    VFilterScaler*    m_vFilterScaler; // NULL when bitDepth is neither 8 nor 10
+public:
+    // bitDepth selects the 8-bit or 10-bit kernel; any other value leaves the
+    // pointer NULL rather than uninitialized.
+    ScalerVCrFilter(int bitDepth) : m_vFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_vFilterScaler = new VFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_vFilterScaler = new VFilterScaler10Bit;
+    }
+    // The kernel is created with new, so it is released with delete (which
+    // also runs its virtual destructor); delete on NULL is a no-op.
+    ~ScalerVCrFilter() { delete m_vFilterScaler; }
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+class ScalerSlice
+{
+private:
+    enum ScalerSlicePlaneNum { m_numSlicePlane = 4 };
+public:
+    int m_width;        // Slice line width
+    int m_hCrSubSample; // horizontal Chroma subsampling factor

 
@@ -0,0 +1,254 @@
+/*****************************************************************************
+ * Copyright (C) 2013-2020 MulticoreWare, Inc
+ *
+ * Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_SCALER_H
+#define X265_SCALER_H
+
+#include "common.h"
+
+namespace X265_NS {
+//x265 private namespace
+
+class ScalerSlice;
+class VideoDesc;
+
+#define MAX_NUM_LINES_AHEAD 4
+#define SCALER_ALIGN(x, j) (((x)+(j)-1)&~((j)-1))
+#define X265_ABS(j) ((j) >= 0 ? (j) : (-(j)))
+#define SCALER_MAX_REDUCE_CUTOFF 0.002
+#define SCALER_BITEXACT  0x80000
+#define ROUNDED_DIVISION(i,j) (((i)>0 ? (i) + ((j)>>1) : (i) - ((j)>>1))/(j))
+#define UH_CEIL_SHIFTR(i,j) (!scale_builtin_constant_p(j) ? -((-(i)) >> (j)) \
+                                                          : ((i) + (1<<(j)) - 1) >> (j))
+
+#if defined(__GNUC__) || defined(__clang__)
+#    define scale_builtin_constant_p __builtin_constant_p
+#else
+#    define scale_builtin_constant_p(x) 0
+#endif
+
+enum ResFactor
+{
+    RES_FACTOR_64, RES_FACTOR_32, RES_FACTOR_16, RES_FACTOR_8,
+    RES_FACTOR_4, RES_FACTOR_DEF, NUM_RES_FACTOR
+};
+
+enum ScalerFactor
+{
+    FACTOR_4, FACTOR_8, NUM_FACTOR
+};
+
+enum FilterSize
+{
+    FIL_4, FIL_6, FIL_8, FIL_9, FIL_10, FIL_11, FIL_13, FIL_15,
+    FIL_16, FIL_17, FIL_19, FIL_22, FIL_24, FIL_DEF, NUM_FIL
+};
+
+// Abstract base of one scaling stage. It holds the coefficient and position
+// tables plus the source and destination slices; concrete stages implement
+// process().
+class ScalerFilter {
+public:
+    int             m_filtLen;      // Number of coefficients per output sample (passed to the kernels as filterSize).
+    int32_t*        m_filtPos;      // Array of horizontal/vertical starting pos for each dst for luma / chroma planes.
+    int16_t*        m_filt;         // Array of horizontal/vertical filter coefficients for luma / chroma planes.
+    ScalerSlice*    m_sourceSlice;  // Source slice
+    ScalerSlice*    m_destSlice;    // Output slice
+    ScalerFilter();
+    // Releases m_filtPos and m_filt with delete[].
+    virtual ~ScalerFilter();
+    // Processes sliceHor lines starting at absolute line sliceVer.
+    virtual void process(int sliceVer, int sliceHor) = 0;
+    // NOTE(review): presumably builds m_filt / m_filtPos / m_filtLen; the definition is not shown here - confirm.
+    int initCoeff(int flag, int inc, int srcW, int dstW, int filtAlign, int one, int sourcePos, int destPos);
+    // Attaches the slices read from and written to by process(); ownership is not transferred.
+    void setSlice(ScalerSlice* source, ScalerSlice* dest) { m_sourceSlice = source; m_destSlice = dest; }
+};
+
+// Bundles a picture's width, height, csp value and input bit depth.
+class VideoDesc {
+public:
+    int         m_width;
+    int         m_height;
+    int         m_csp;
+    int         m_inputDepth;
+
+    // Stores the four values unchanged.
+    VideoDesc(int w, int h, int csp, int bitDepth) :
+        m_width(w),
+        m_height(h),
+        m_csp(csp),
+        m_inputDepth(bitDepth)
+    {
+    }
+};
+
+// Line-buffer bookkeeping for one plane of a slice. The horizontal filters
+// index lineBuf relative to sliceVer and increment sliceHor once per line
+// they produce.
+typedef struct ScalerPlane
+{
+    int       availLines; // max number of lines that can be held by this plane
+    int       sliceVer;   // index of first line
+    int       sliceHor;   // number of lines
+    uint8_t** lineBuf;    // line buffer
+} ScalerPlane;
+
+// Base class of the horizontal scaling kernels; subclasses set m_bitDepth
+// and implement doScaling() for one line of samples.
+class HFilterScaler {
+public:
+    int m_bitDepth;
+
+    HFilterScaler() : m_bitDepth(0) {}
+    virtual ~HFilterScaler() {}
+    virtual void doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize) = 0;
+};
+
+// Base class of the vertical scaling kernels; subclasses set m_bitDepth
+// and implement yuv2PlaneX() for one output line.
+class VFilterScaler {
+public:
+    int m_bitDepth;
+
+    VFilterScaler() : m_bitDepth(0) {}
+    virtual ~VFilterScaler() {}
+    virtual void yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW) = 0;
+};
+
+// Horizontal scaling kernel for the 8-bit case.
+// The constructor records a bit depth of 8 in m_bitDepth.
+class HFilterScaler8Bit : public HFilterScaler {
+public:
+    HFilterScaler8Bit() { m_bitDepth = 8; }
+    virtual void doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize);
+};
+
+// Horizontal scaling kernel for the 10-bit case.
+// The constructor records a bit depth of 10 in m_bitDepth.
+class HFilterScaler10Bit : public HFilterScaler {
+public:
+    HFilterScaler10Bit() { m_bitDepth = 10; }
+    virtual void doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize);
+};
+
+// Vertical scaling kernel for the 8-bit case.
+// The constructor records a bit depth of 8 in m_bitDepth.
+class VFilterScaler8Bit : public VFilterScaler {
+public:
+    VFilterScaler8Bit() { m_bitDepth = 8; }
+    virtual void yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW);
+};
+
+// Vertical scaling kernel for the 10-bit case.
+// The constructor records a bit depth of 10 in m_bitDepth.
+class VFilterScaler10Bit : public VFilterScaler {
+public:
+    VFilterScaler10Bit() { m_bitDepth = 10; }
+    virtual void yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW);
+};
+
+// Horizontal filter for luma.
+// Owns a bit-depth specific HFilterScaler that scales each line.
+class ScalerHLumFilter : public ScalerFilter {
+private:
+    HFilterScaler* m_hFilterScaler; // NULL when bitDepth is neither 8 nor 10
+public:
+    // bitDepth selects the 8-bit or 10-bit kernel; any other value leaves the
+    // pointer NULL rather than uninitialized.
+    ScalerHLumFilter(int bitDepth) : m_hFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_hFilterScaler = new HFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_hFilterScaler = new HFilterScaler10Bit;
+    }
+    // The kernel is created with new, so it is released with delete (which
+    // also runs its virtual destructor); delete on NULL is a no-op.
+    ~ScalerHLumFilter() { delete m_hFilterScaler; }
+    // Scales sliceHor luma lines starting at absolute line sliceVer.
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// Horizontal filter for chroma.
+// Owns a bit-depth specific HFilterScaler that scales each line.
+class ScalerHCrFilter : public ScalerFilter {
+private:
+    HFilterScaler* m_hFilterScaler; // NULL when bitDepth is neither 8 nor 10
+public:
+    // bitDepth selects the 8-bit or 10-bit kernel; any other value leaves the
+    // pointer NULL rather than uninitialized.
+    ScalerHCrFilter(int bitDepth) : m_hFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_hFilterScaler = new HFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_hFilterScaler = new HFilterScaler10Bit;
+    }
+    // The kernel is created with new, so it is released with delete (which
+    // also runs its virtual destructor); delete on NULL is a no-op.
+    ~ScalerHCrFilter() { delete m_hFilterScaler; }
+    // Scales sliceHor chroma lines starting at absolute line sliceVer.
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// Vertical filter for luma.
+// Owns a bit-depth specific VFilterScaler that produces each output line.
+class ScalerVLumFilter : public ScalerFilter {
+private:
+    VFilterScaler* m_vFilterScaler; // NULL when bitDepth is neither 8 nor 10
+public:
+    // bitDepth selects the 8-bit or 10-bit kernel; any other value leaves the
+    // pointer NULL rather than uninitialized.
+    ScalerVLumFilter(int bitDepth) : m_vFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_vFilterScaler = new VFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_vFilterScaler = new VFilterScaler10Bit;
+    }
+    // The kernel is created with new, so it is released with delete (which
+    // also runs its virtual destructor); delete on NULL is a no-op.
+    ~ScalerVLumFilter() { delete m_vFilterScaler; }
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// Vertical filter for chroma.
+// Owns a bit-depth specific VFilterScaler that produces each output line.
+class ScalerVCrFilter : public ScalerFilter {
+private:
+    VFilterScaler*    m_vFilterScaler; // NULL when bitDepth is neither 8 nor 10
+public:
+    // bitDepth selects the 8-bit or 10-bit kernel; any other value leaves the
+    // pointer NULL rather than uninitialized.
+    ScalerVCrFilter(int bitDepth) : m_vFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_vFilterScaler = new VFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_vFilterScaler = new VFilterScaler10Bit;
+    }
+    // The kernel is created with new, so it is released with delete (which
+    // also runs its virtual destructor); delete on NULL is a no-op.
+    ~ScalerVCrFilter() { delete m_vFilterScaler; }
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+class ScalerSlice
+{
+private:
+    enum ScalerSlicePlaneNum { m_numSlicePlane = 4 };
+public:
+    int m_width;        // Slice line width
+    int m_hCrSubSample; // horizontal Chroma subsampling factor
​

x265_3.3.tar.gz/source/common/threading.h -> x265_3.4.tar.gz/source/common/threading.h Changed

 
@@ -238,6 +238,14 @@
         LeaveCriticalSection(&m_cs);
     }
 
+    // Decrements m_val while holding m_cs, then wakes every thread waiting
+    // on m_cv before releasing the critical section.
+    void decr()
+    {
+        EnterCriticalSection(&m_cs);
+        m_val--;
+        WakeAllConditionVariable(&m_cv);
+        LeaveCriticalSection(&m_cs);
+    }
+
 protected:
 
     CRITICAL_SECTION   m_cs;
@@ -436,6 +444,14 @@
         pthread_mutex_unlock(&m_mutex);
     }
 
+    // Decrements m_val while holding m_mutex, then broadcasts on m_cond to
+    // wake every waiting thread before unlocking.
+    void decr()
+    {
+        pthread_mutex_lock(&m_mutex);
+        m_val--;
+        pthread_cond_broadcast(&m_cond);
+        pthread_mutex_unlock(&m_mutex);
+    }
+
 protected:
 
     pthread_mutex_t m_mutex;
​

x265_3.3.tar.gz/source/encoder/analysis.cpp -> x265_3.4.tar.gz/source/encoder/analysis.cpp Changed

@@ -1272,7 +1272,7 @@
                     md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
                     checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
 
-                    skipRecursion = !!m_param->bEnableRecursionSkip && md.bestMode;
+                    skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
                     if (m_param->rdLevel)
                         skipModes = m_param->bEnableEarlySkip && md.bestMode;
                 }
@@ -1296,7 +1296,7 @@
                     md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
                     checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
 
-                    skipRecursion = !!m_param->bEnableRecursionSkip && md.bestMode;
+                    skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
                     if (m_param->rdLevel)
                         skipModes = m_param->bEnableEarlySkip && md.bestMode;
                 }
@@ -1314,15 +1314,23 @@
                 skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2)
                 && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
         }
-        if (md.bestMode && m_param->bEnableRecursionSkip && !bCtuInfoCheck && !(m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
+        if (md.bestMode && m_param->recursionSkipMode && !bCtuInfoCheck && !(m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
         {
             skipRecursion = md.bestMode->cu.isSkipped(0);
-            if (mightSplit && depth >= minDepth && !skipRecursion)
+            if (mightSplit && !skipRecursion)
             {
-                if (depth)
-                    skipRecursion = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
-                if (m_bHD && !skipRecursion && m_param->rdLevel == 2 && md.fencYuv.m_size != MAX_CU_SIZE)
+                if (depth >= minDepth && m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
+                {
+                    if (depth)
+                        skipRecursion = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
+                    if (m_bHD && !skipRecursion && m_param->rdLevel == 2 && md.fencYuv.m_size != MAX_CU_SIZE)
+                        skipRecursion = complexityCheckCU(*md.bestMode);
+                }
+                else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
+                {
                     skipRecursion = complexityCheckCU(*md.bestMode);
+                }
+
             }
         }
         if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)
@@ -1972,7 +1980,7 @@
                     checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
                     checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
 
-                    if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode)
+                    if (m_param->recursionSkipMode && depth && m_modeDepth[depth - 1].bestMode)
                         skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
                 }
                 if (m_param->analysisLoadReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
@@ -1996,7 +2004,7 @@
                     checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
                     checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
 
-                    if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode)
+                    if (m_param->recursionSkipMode && depth && m_modeDepth[depth - 1].bestMode)
                         skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
                 }
             }
@@ -2015,8 +2023,10 @@
             checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
             checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
 
-            if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode)
+            if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP && depth && m_modeDepth[depth - 1].bestMode)
                 skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
+            else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
+                skipRecursion = md.bestMode && complexityCheckCU(*md.bestMode);
         }
         if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)
             skipRecursion = true;
@@ -3525,27 +3535,47 @@
 
 bool Analysis::complexityCheckCU(const Mode& bestMode)
 {
-    uint32_t mean = 0;
-    uint32_t homo = 0;
-    uint32_t cuSize = bestMode.fencYuv->m_size;
-    for (uint32_t y = 0; y < cuSize; y++) {
-        for (uint32_t x = 0; x < cuSize; x++) {
-            mean += (bestMode.fencYuv->m_buf[0][y * cuSize + x]);
+    if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
+    {
+        uint32_t mean = 0;
+        uint32_t homo = 0;
+        uint32_t cuSize = bestMode.fencYuv->m_size;
+        for (uint32_t y = 0; y < cuSize; y++) {
+            for (uint32_t x = 0; x < cuSize; x++) {
+                mean += (bestMode.fencYuv->m_buf[0][y * cuSize + x]);
+            }
         }
-    }
-    mean = mean / (cuSize * cuSize);
-    for (uint32_t y = 0 ; y < cuSize; y++){
-        for (uint32_t x = 0 ; x < cuSize; x++){
-            homo += abs(int(bestMode.fencYuv->m_buf[0][y * cuSize + x] - mean));
+        mean = mean / (cuSize * cuSize);
+        for (uint32_t y = 0; y < cuSize; y++) {
+            for (uint32_t x = 0; x < cuSize; x++) {
+                homo += abs(int(bestMode.fencYuv->m_buf[0][y * cuSize + x] - mean));
+            }
         }
-    }
-    homo = homo / (cuSize * cuSize);
+        homo = homo / (cuSize * cuSize);
 
-    if (homo < (.1 * mean))
-        return true;
+        if (homo < (.1 * mean))
+            return true;
 
-    return false;
-}
+        return false;
+    }
+    else
+    {
+        int blockType = bestMode.cu.m_log2CUSize[0] - LOG2_UNIT_SIZE;
+        int shift = bestMode.cu.m_log2CUSize[0] * LOG2_UNIT_SIZE;
+        intptr_t stride = m_frame->m_fencPic->m_stride;
+        intptr_t blockOffsetLuma = bestMode.cu.m_cuPelX + bestMode.cu.m_cuPelY * stride;
+        uint64_t sum_ss = primitives.cu[blockType].var(m_frame->m_edgeBitPic + blockOffsetLuma, stride);
+        uint32_t sum = (uint32_t)sum_ss;
+        uint32_t ss = (uint32_t)(sum_ss >> 32);
+        uint32_t pixelCount = 1 << shift;
+        double cuEdgeVariance = (ss - ((double)sum * sum / pixelCount)) / pixelCount;
+
+        if (cuEdgeVariance > (double)m_param->edgeVarThreshold)
+            return false;
+        else
+            return true;
+    }
+ }
 
 uint32_t Analysis::calculateCUVariance(const CUData& ctu, const CUGeom& cuGeom)
 {
@@ -3570,7 +3600,6 @@
             cnt++;
         }
     }
-    
     return cuVariance / cnt;
 }

 
@@ -1272,7 +1272,7 @@
                     md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
                     checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
 
-                    skipRecursion = !!m_param->bEnableRecursionSkip && md.bestMode;
+                    skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
                     if (m_param->rdLevel)
                         skipModes = m_param->bEnableEarlySkip && md.bestMode;
                 }
@@ -1296,7 +1296,7 @@
                     md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
                     checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
 
-                    skipRecursion = !!m_param->bEnableRecursionSkip && md.bestMode;
+                    skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
                     if (m_param->rdLevel)
                         skipModes = m_param->bEnableEarlySkip && md.bestMode;
                 }
@@ -1314,15 +1314,23 @@
                 skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2)
                 && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
         }
-        if (md.bestMode && m_param->bEnableRecursionSkip && !bCtuInfoCheck && !(m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
+        if (md.bestMode && m_param->recursionSkipMode && !bCtuInfoCheck && !(m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
         {
             skipRecursion = md.bestMode->cu.isSkipped(0);
-            if (mightSplit && depth >= minDepth && !skipRecursion)
+            if (mightSplit && !skipRecursion)
             {
-                if (depth)
-                    skipRecursion = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
-                if (m_bHD && !skipRecursion && m_param->rdLevel == 2 && md.fencYuv.m_size != MAX_CU_SIZE)
+                if (depth >= minDepth && m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
+                {
+                    if (depth)
+                        skipRecursion = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
+                    if (m_bHD && !skipRecursion && m_param->rdLevel == 2 && md.fencYuv.m_size != MAX_CU_SIZE)
+                        skipRecursion = complexityCheckCU(*md.bestMode);
+                }
+                else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
+                {
                     skipRecursion = complexityCheckCU(*md.bestMode);
+                }
+
             }
         }
         if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)
@@ -1972,7 +1980,7 @@
                     checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
                     checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
 
-                    if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode)
+                    if (m_param->recursionSkipMode && depth && m_modeDepth[depth - 1].bestMode)
                         skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
                 }
                 if (m_param->analysisLoadReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
@@ -1996,7 +2004,7 @@
                     checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
                     checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
 
-                    if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode)
+                    if (m_param->recursionSkipMode && depth && m_modeDepth[depth - 1].bestMode)
                         skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
                 }
             }
@@ -2015,8 +2023,10 @@
             checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
             checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
 
-            if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode)
+            if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP && depth && m_modeDepth[depth - 1].bestMode)
                 skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
+            else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
+                skipRecursion = md.bestMode && complexityCheckCU(*md.bestMode);
         }
         if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)
             skipRecursion = true;
@@ -3525,27 +3535,47 @@
 
 bool Analysis::complexityCheckCU(const Mode& bestMode)
 {
-    uint32_t mean = 0;
-    uint32_t homo = 0;
-    uint32_t cuSize = bestMode.fencYuv->m_size;
-    for (uint32_t y = 0; y < cuSize; y++) {
-        for (uint32_t x = 0; x < cuSize; x++) {
-            mean += (bestMode.fencYuv->m_buf[0][y * cuSize + x]);
+    if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
+    {
+        uint32_t mean = 0;
+        uint32_t homo = 0;
+        uint32_t cuSize = bestMode.fencYuv->m_size;
+        for (uint32_t y = 0; y < cuSize; y++) {
+            for (uint32_t x = 0; x < cuSize; x++) {
+                mean += (bestMode.fencYuv->m_buf[0][y * cuSize + x]);
+            }
         }
-    }
-    mean = mean / (cuSize * cuSize);
-    for (uint32_t y = 0 ; y < cuSize; y++){
-        for (uint32_t x = 0 ; x < cuSize; x++){
-            homo += abs(int(bestMode.fencYuv->m_buf[0][y * cuSize + x] - mean));
+        mean = mean / (cuSize * cuSize);
+        for (uint32_t y = 0; y < cuSize; y++) {
+            for (uint32_t x = 0; x < cuSize; x++) {
+                homo += abs(int(bestMode.fencYuv->m_buf[0][y * cuSize + x] - mean));
+            }
         }
-    }
-    homo = homo / (cuSize * cuSize);
+        homo = homo / (cuSize * cuSize);
 
-    if (homo < (.1 * mean))
-        return true;
+        if (homo < (.1 * mean))
+            return true;
 
-    return false;
-}
+        return false;
+    }
+    else
+    {
+        int blockType = bestMode.cu.m_log2CUSize[0] - LOG2_UNIT_SIZE;
+        int shift = bestMode.cu.m_log2CUSize[0] * LOG2_UNIT_SIZE;
+        intptr_t stride = m_frame->m_fencPic->m_stride;
+        intptr_t blockOffsetLuma = bestMode.cu.m_cuPelX + bestMode.cu.m_cuPelY * stride;
+        uint64_t sum_ss = primitives.cu[blockType].var(m_frame->m_edgeBitPic + blockOffsetLuma, stride);
+        uint32_t sum = (uint32_t)sum_ss;
+        uint32_t ss = (uint32_t)(sum_ss >> 32);
+        uint32_t pixelCount = 1 << shift;
+        double cuEdgeVariance = (ss - ((double)sum * sum / pixelCount)) / pixelCount;
+
+        if (cuEdgeVariance > (double)m_param->edgeVarThreshold)
+            return false;
+        else
+            return true;
+    }
+ }
 
 uint32_t Analysis::calculateCUVariance(const CUData& ctu, const CUGeom& cuGeom)
 {
@@ -3570,7 +3600,6 @@
             cnt++;
         }
     }
-    
     return cuVariance / cnt;
 }
 
​

x265_3.3.tar.gz/source/encoder/analysis.h -> x265_3.4.tar.gz/source/encoder/analysis.h Changed

 
@@ -52,7 +52,7 @@
         splitRefs = 0;
         mvCost[0] = 0; // L0
         mvCost[1] = 0; // L1
-        sa8dCost    = 0;
+        sa8dCost  = 0;
     }
 };
 
@@ -120,7 +120,6 @@
 
     Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
     int32_t loadTUDepth(CUGeom cuGeom, CUData parentCTU);
-
 protected:
     /* Analysis data for save/load mode, writes/reads data based on absPartIdx */
     x265_analysis_inter_data*  m_reuseInterDataCTU;
​

x265_3.3.tar.gz/source/encoder/api.cpp -> x265_3.4.tar.gz/source/encoder/api.cpp Changed

@@ -1016,12 +1016,12 @@
 
 void x265_zone_free(x265_param *param)
 {
-    if (param && param->rc.zonefileCount) {
+    if (param && param->rc.zones && (param->rc.zoneCount || param->rc.zonefileCount))
+    {
         for (int i = 0; i < param->rc.zonefileCount; i++)
             x265_free(param->rc.zones[i].zoneParam);
-    }
-    if (param && (param->rc.zoneCount || param->rc.zonefileCount))
         x265_free(param->rc.zones);
+    }
 }
 
 static const x265_api libapi =
@@ -1294,6 +1294,8 @@
                     fprintf(csvfp, "RateFactor, ");
                 if (param->rc.vbvBufferSize)
                     fprintf(csvfp, "BufferFill, BufferFillFinal, ");
+                if (param->rc.vbvBufferSize && param->csvLogLevel >= 2)
+                    fprintf(csvfp, "UnclippedBufferFillFinal, ");
                 if (param->bEnablePsnr)
                     fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, ");
                 if (param->bEnableSsim)
@@ -1405,6 +1407,8 @@
         fprintf(param->csvfpt, "%.3lf,", frameStats->rateFactor);
     if (param->rc.vbvBufferSize)
         fprintf(param->csvfpt, "%.3lf, %.3lf,", frameStats->bufferFill, frameStats->bufferFillFinal);
+    if (param->rc.vbvBufferSize && param->csvLogLevel >= 2)
+        fprintf(param->csvfpt, "%.3lf,", frameStats->unclippedBufferFillFinal);
     if (param->bEnablePsnr)
         fprintf(param->csvfpt, "%.3lf, %.3lf, %.3lf, %.3lf,", frameStats->psnrY, frameStats->psnrU, frameStats->psnrV, frameStats->psnr);
     if (param->bEnableSsim)

 
@@ -1016,12 +1016,12 @@
 
 void x265_zone_free(x265_param *param)
 {
-    if (param && param->rc.zonefileCount) {
+    if (param && param->rc.zones && (param->rc.zoneCount || param->rc.zonefileCount))
+    {
         for (int i = 0; i < param->rc.zonefileCount; i++)
             x265_free(param->rc.zones[i].zoneParam);
-    }
-    if (param && (param->rc.zoneCount || param->rc.zonefileCount))
         x265_free(param->rc.zones);
+    }
 }
 
 static const x265_api libapi =
@@ -1294,6 +1294,8 @@
                     fprintf(csvfp, "RateFactor, ");
                 if (param->rc.vbvBufferSize)
                     fprintf(csvfp, "BufferFill, BufferFillFinal, ");
+                if (param->rc.vbvBufferSize && param->csvLogLevel >= 2)
+                    fprintf(csvfp, "UnclippedBufferFillFinal, ");
                 if (param->bEnablePsnr)
                     fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, ");
                 if (param->bEnableSsim)
@@ -1405,6 +1407,8 @@
         fprintf(param->csvfpt, "%.3lf,", frameStats->rateFactor);
     if (param->rc.vbvBufferSize)
         fprintf(param->csvfpt, "%.3lf, %.3lf,", frameStats->bufferFill, frameStats->bufferFillFinal);
+    if (param->rc.vbvBufferSize && param->csvLogLevel >= 2)
+        fprintf(param->csvfpt, "%.3lf,", frameStats->unclippedBufferFillFinal);
     if (param->bEnablePsnr)
         fprintf(param->csvfpt, "%.3lf, %.3lf, %.3lf, %.3lf,", frameStats->psnrY, frameStats->psnrU, frameStats->psnrV, frameStats->psnr);
     if (param->bEnableSsim)
​

x265_3.3.tar.gz/source/encoder/encoder.cpp -> x265_3.4.tar.gz/source/encoder/encoder.cpp Changed

@@ -218,10 +218,7 @@
 
     if (m_param->bHistBasedSceneCut)
     {
-        for (int i = 0; i < x265_cli_csps[m_param->internalCsp].planes; i++)
-        {
-            m_planeSizes[i] = (m_param->sourceWidth >> x265_cli_csps[p->internalCsp].width[i]) * (m_param->sourceHeight >> x265_cli_csps[m_param->internalCsp].height[i]);
-        }
+        m_planeSizes[0] = (m_param->sourceWidth >> x265_cli_csps[p->internalCsp].width[0]) * (m_param->sourceHeight >> x265_cli_csps[m_param->internalCsp].height[0]);
         uint32_t pixelbytes = m_param->internalBitDepth > 8 ? 2 : 1;
         m_edgePic = X265_MALLOC(pixel, m_planeSizes[0] * pixelbytes);
         m_edgeHistThreshold = m_param->edgeTransitionThreshold;
@@ -1443,9 +1440,9 @@
     int32_t planeCount = x265_cli_csps[m_param->internalCsp].planes;
     memset(m_edgePic, 0, bufSize);
 
-    if (!computeEdge(m_edgePic, src, NULL, pic->width, pic->height, pic->width, false))
+    if (!computeEdge(m_edgePic, src, NULL, pic->width, pic->height, pic->width, false, 1))
     {
-        x265_log(m_param, X265_LOG_ERROR, "Failed edge computation!");
+        x265_log(m_param, X265_LOG_ERROR, "Failed to compute edge!");
         return false;
     }
 
@@ -1605,6 +1602,14 @@
         if (m_param->bHistBasedSceneCut && pic_in)
         {
             x265_picture *pic = (x265_picture *) pic_in;
+
+            if (pic->poc == 0)
+            {
+                /* for entire encode compute the chroma plane sizes only once */
+                for (int i = 1; i < x265_cli_csps[m_param->internalCsp].planes; i++)
+                    m_planeSizes[i] = (pic->width >> x265_cli_csps[m_param->internalCsp].width[i]) * (pic->height >> x265_cli_csps[m_param->internalCsp].height[i]);
+            }
+
             if (computeHistograms(pic))
             {
                 double maxUVSad = 0.0, edgeSad = 0.0;
@@ -1752,6 +1757,12 @@
                         }
                     }
                 }
+                if (m_param->recursionSkipMode == EDGE_BASED_RSKIP && m_param->bHistBasedSceneCut)
+                {
+                    pixel* src = m_edgePic;
+                    primitives.planecopy_pp_shr(src, inFrame->m_fencPic->m_picWidth, inFrame->m_edgeBitPic, inFrame->m_fencPic->m_stride,
+                        inFrame->m_fencPic->m_picWidth, inFrame->m_fencPic->m_picHeight, 0);
+                }
             }
             else
             {
@@ -2414,7 +2425,7 @@
         encParam->maxNumReferences = param->maxNumReferences; // never uses more refs than specified in stream headers
         encParam->bEnableFastIntra = param->bEnableFastIntra;
         encParam->bEnableEarlySkip = param->bEnableEarlySkip;
-        encParam->bEnableRecursionSkip = param->bEnableRecursionSkip;
+        encParam->recursionSkipMode = param->recursionSkipMode;
         encParam->searchMethod = param->searchMethod;
         /* Scratch buffer prevents me_range from being increased for esa/tesa */
         if (param->searchRange < encParam->searchRange)
@@ -3006,6 +3017,8 @@
             frameStats->ipCostRatio = curFrame->m_lowres.ipCostRatio;
         frameStats->bufferFill = m_rateControl->m_bufferFillActual;
         frameStats->bufferFillFinal = m_rateControl->m_bufferFillFinal;
+        if (m_param->csvLogLevel >= 2)
+            frameStats->unclippedBufferFillFinal = m_rateControl->m_unclippedBufferFillFinal;
         frameStats->frameLatency = inPoc - poc;
         if (m_param->rc.rateControlMode == X265_RC_CRF)
             frameStats->rateFactor = curEncData.m_rateFactor;
@@ -3400,7 +3413,7 @@
         p->maxNumReferences = zone->maxNumReferences;
         p->bEnableFastIntra = zone->bEnableFastIntra;
         p->bEnableEarlySkip = zone->bEnableEarlySkip;
-        p->bEnableRecursionSkip = zone->bEnableRecursionSkip;
+        p->recursionSkipMode = zone->recursionSkipMode;
         p->searchMethod = zone->searchMethod;
         p->searchRange = zone->searchRange;
         p->subpelRefine = zone->subpelRefine;
@@ -3681,20 +3694,6 @@
     if (p->analysisLoad && !p->analysisLoadReuseLevel)
         p->analysisLoadReuseLevel = 5;
 
-    if ((p->bAnalysisType == DEFAULT) && p->rc.cuTree)
-    {
-        if (p->analysisSaveReuseLevel && p->analysisSaveReuseLevel < 10)
-        {
-            x265_log(p, X265_LOG_WARNING, "cu-tree works only with analysis-save-reuse-level 10, Disabling cu-tree\n");
-            p->rc.cuTree = 0;
-        }
-        if (p->analysisLoadReuseLevel && p->analysisLoadReuseLevel < 10)
-        {
-            x265_log(p, X265_LOG_WARNING, "cu-tree works only with analysis-load-reuse-level 10, Disabling cu-tree\n");
-            p->rc.cuTree = 0;
-        }
-    }
-
     if ((p->analysisLoad || p->analysisSave) && (p->bDistributeModeAnalysis || p->bDistributeMotionEstimation))
     {
         x265_log(p, X265_LOG_WARNING, "Analysis load/save options incompatible with pmode/pme, Disabling pmode/pme\n");
@@ -3867,29 +3866,30 @@
         }
         else
         {
-            if (fread(&m_conformanceWindow.rightOffset, sizeof(int), 1, m_analysisFileIn) != 1)
+            int rightOffset, bottomOffset;
+            if (fread(&rightOffset, sizeof(int), 1, m_analysisFileIn) != 1)
             {
                 x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data. Conformance window right offset missing\n");
                 m_aborted = true;
             }
-            else if (m_conformanceWindow.rightOffset && p->analysisLoadReuseLevel > 1)
+            else if (rightOffset && p->analysisLoadReuseLevel > 1)
             {
                 int scaleFactor = p->scaleFactor < 2 ? 1 : p->scaleFactor;
-                padsize = m_conformanceWindow.rightOffset * scaleFactor;
+                padsize = rightOffset * scaleFactor;
                 p->sourceWidth += padsize;
                 m_conformanceWindow.bEnabled = true;
                 m_conformanceWindow.rightOffset = padsize;
             }
 
-            if (fread(&m_conformanceWindow.bottomOffset, sizeof(int), 1, m_analysisFileIn) != 1)
+            if (fread(&bottomOffset, sizeof(int), 1, m_analysisFileIn) != 1)
             {
                 x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data. Conformance window bottom offset missing\n");
                 m_aborted = true;
             }
-            else if (m_conformanceWindow.bottomOffset && p->analysisLoadReuseLevel > 1)
+            else if (bottomOffset && p->analysisLoadReuseLevel > 1)
             {
                 int scaleFactor = p->scaleFactor < 2 ? 1 : p->scaleFactor;
-                padsize = m_conformanceWindow.bottomOffset * scaleFactor;
+                padsize = bottomOffset * scaleFactor;
                 p->sourceHeight += padsize;
                 m_conformanceWindow.bEnabled = true;
                 m_conformanceWindow.bottomOffset = padsize;
@@ -4196,7 +4196,7 @@
         x265_log(p, X265_LOG_WARNING, "Radl requires fixed gop-length (keyint == min-keyint). Disabling radl.\n");
     }
 
-    if ((p->chunkStart || p->chunkEnd) && p->bOpenGOP)
+    if ((p->chunkStart || p->chunkEnd) && p->bOpenGOP && m_param->bResetZoneConfig)
     {
         p->chunkStart = p->chunkEnd = 0;
         x265_log(p, X265_LOG_WARNING, "Chunking requires closed gop structure. Disabling chunking.\n");
@@ -4229,12 +4229,6 @@
         x265_log(p, X265_LOG_WARNING, "Turning on repeat - headers for zone encoding\n");
     }
 
-    if (!m_param->bResetZoneConfig && (p->keyframeMax != p->keyframeMin))
-        x265_log(p, X265_LOG_WARNING, "External zone reconfiguration requires a fixed GOP size to enable appropriate signaling of HRD info\n");
-
-    if (!m_param->bResetZoneConfig && (p->reconfigWindowSize != (uint64_t)p->keyframeMax))
-        x265_log(p, X265_LOG_WARNING, "Zone size must be multiple of GOP size to enable appropriate signaling of HRD info\n");
-
     if (m_param->bEnableHME)
     {
         if (m_param->sourceHeight < 540)
@@ -4311,18 +4305,27 @@
         }
     }
 
+    uint32_t numCUsLoad, numCUsInHeightLoad;
+
     /* Now arrived at the right frame, read the record */
     analysis->poc = poc;
     analysis->frameRecordSize = frameRecordSize;
     X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFileIn, &(picData->sliceType));
     X265_FREAD(&analysis->bScenecut, sizeof(int), 1, m_analysisFileIn, &(picData->bScenecut));
     X265_FREAD(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFileIn, &(picData->satdCost));
-    X265_FREAD(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFileIn, &(picData->numCUsInFrame));
+    X265_FREAD(&numCUsLoad, sizeof(int), 1, m_analysisFileIn, &(picData->numCUsInFrame));
     X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFileIn, &(picData->numPartitions));
 
+    /* Update analysis info to save current settings */
+    uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
+    uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
+    uint32_t numCUsInFrame = widthInCU * heightInCU;
+    analysis->numCUsInFrame = numCUsInFrame;
+    analysis->numCuInHeight = heightInCU;
+
     if (m_param->bDisableLookahead)
     {
-        X265_FREAD(&analysis->numCuInHeight, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->numCuInHeight));
+        X265_FREAD(&numCUsInHeightLoad, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->numCuInHeight));
         X265_FREAD(&analysis->lookahead, sizeof(x265_lookahead_data), 1, m_analysisFileIn, &(picData->lookahead));
     }
     int scaledNumPartition = analysis->numPartitions;
@@ -4335,16 +4338,16 @@
 
     if (m_param->ctuDistortionRefine == CTU_DISTORTION_INTERNAL)
     {
-        X265_FREAD((analysis->distortionData)->ctuDistortion, sizeof(sse_t), analysis->numCUsInFrame, m_analysisFileIn, picDistortion);
+        X265_FREAD((analysis->distortionData)->ctuDistortion, sizeof(sse_t), numCUsLoad, m_analysisFileIn, picDistortion);
         computeDistortionOffset(analysis);
     }
     if (m_param->bDisableLookahead && m_rateControl->m_isVbv)
     {
         size_t vbvCount = m_param->lookaheadDepth + m_param->bframes + 2;

 
@@ -218,10 +218,7 @@
 
     if (m_param->bHistBasedSceneCut)
     {
-        for (int i = 0; i < x265_cli_csps[m_param->internalCsp].planes; i++)
-        {
-            m_planeSizes[i] = (m_param->sourceWidth >> x265_cli_csps[p->internalCsp].width[i]) * (m_param->sourceHeight >> x265_cli_csps[m_param->internalCsp].height[i]);
-        }
+        m_planeSizes[0] = (m_param->sourceWidth >> x265_cli_csps[p->internalCsp].width[0]) * (m_param->sourceHeight >> x265_cli_csps[m_param->internalCsp].height[0]);
         uint32_t pixelbytes = m_param->internalBitDepth > 8 ? 2 : 1;
         m_edgePic = X265_MALLOC(pixel, m_planeSizes[0] * pixelbytes);
         m_edgeHistThreshold = m_param->edgeTransitionThreshold;
@@ -1443,9 +1440,9 @@
     int32_t planeCount = x265_cli_csps[m_param->internalCsp].planes;
     memset(m_edgePic, 0, bufSize);
 
-    if (!computeEdge(m_edgePic, src, NULL, pic->width, pic->height, pic->width, false))
+    if (!computeEdge(m_edgePic, src, NULL, pic->width, pic->height, pic->width, false, 1))
     {
-        x265_log(m_param, X265_LOG_ERROR, "Failed edge computation!");
+        x265_log(m_param, X265_LOG_ERROR, "Failed to compute edge!");
         return false;
     }
 
@@ -1605,6 +1602,14 @@
         if (m_param->bHistBasedSceneCut && pic_in)
         {
             x265_picture *pic = (x265_picture *) pic_in;
+
+            if (pic->poc == 0)
+            {
+                /* for entire encode compute the chroma plane sizes only once */
+                for (int i = 1; i < x265_cli_csps[m_param->internalCsp].planes; i++)
+                    m_planeSizes[i] = (pic->width >> x265_cli_csps[m_param->internalCsp].width[i]) * (pic->height >> x265_cli_csps[m_param->internalCsp].height[i]);
+            }
+
             if (computeHistograms(pic))
             {
                 double maxUVSad = 0.0, edgeSad = 0.0;
@@ -1752,6 +1757,12 @@
                         }
                     }
                 }
+                if (m_param->recursionSkipMode == EDGE_BASED_RSKIP && m_param->bHistBasedSceneCut)
+                {
+                    pixel* src = m_edgePic;
+                    primitives.planecopy_pp_shr(src, inFrame->m_fencPic->m_picWidth, inFrame->m_edgeBitPic, inFrame->m_fencPic->m_stride,
+                        inFrame->m_fencPic->m_picWidth, inFrame->m_fencPic->m_picHeight, 0);
+                }
             }
             else
             {
@@ -2414,7 +2425,7 @@
         encParam->maxNumReferences = param->maxNumReferences; // never uses more refs than specified in stream headers
         encParam->bEnableFastIntra = param->bEnableFastIntra;
         encParam->bEnableEarlySkip = param->bEnableEarlySkip;
-        encParam->bEnableRecursionSkip = param->bEnableRecursionSkip;
+        encParam->recursionSkipMode = param->recursionSkipMode;
         encParam->searchMethod = param->searchMethod;
         /* Scratch buffer prevents me_range from being increased for esa/tesa */
         if (param->searchRange < encParam->searchRange)
@@ -3006,6 +3017,8 @@
             frameStats->ipCostRatio = curFrame->m_lowres.ipCostRatio;
         frameStats->bufferFill = m_rateControl->m_bufferFillActual;
         frameStats->bufferFillFinal = m_rateControl->m_bufferFillFinal;
+        if (m_param->csvLogLevel >= 2)
+            frameStats->unclippedBufferFillFinal = m_rateControl->m_unclippedBufferFillFinal;
         frameStats->frameLatency = inPoc - poc;
         if (m_param->rc.rateControlMode == X265_RC_CRF)
             frameStats->rateFactor = curEncData.m_rateFactor;
@@ -3400,7 +3413,7 @@
         p->maxNumReferences = zone->maxNumReferences;
         p->bEnableFastIntra = zone->bEnableFastIntra;
         p->bEnableEarlySkip = zone->bEnableEarlySkip;
-        p->bEnableRecursionSkip = zone->bEnableRecursionSkip;
+        p->recursionSkipMode = zone->recursionSkipMode;
         p->searchMethod = zone->searchMethod;
         p->searchRange = zone->searchRange;
         p->subpelRefine = zone->subpelRefine;
@@ -3681,20 +3694,6 @@
     if (p->analysisLoad && !p->analysisLoadReuseLevel)
         p->analysisLoadReuseLevel = 5;
 
-    if ((p->bAnalysisType == DEFAULT) && p->rc.cuTree)
-    {
-        if (p->analysisSaveReuseLevel && p->analysisSaveReuseLevel < 10)
-        {
-            x265_log(p, X265_LOG_WARNING, "cu-tree works only with analysis-save-reuse-level 10, Disabling cu-tree\n");
-            p->rc.cuTree = 0;
-        }
-        if (p->analysisLoadReuseLevel && p->analysisLoadReuseLevel < 10)
-        {
-            x265_log(p, X265_LOG_WARNING, "cu-tree works only with analysis-load-reuse-level 10, Disabling cu-tree\n");
-            p->rc.cuTree = 0;
-        }
-    }
-
     if ((p->analysisLoad || p->analysisSave) && (p->bDistributeModeAnalysis || p->bDistributeMotionEstimation))
     {
         x265_log(p, X265_LOG_WARNING, "Analysis load/save options incompatible with pmode/pme, Disabling pmode/pme\n");
@@ -3867,29 +3866,30 @@
         }
         else
         {
-            if (fread(&m_conformanceWindow.rightOffset, sizeof(int), 1, m_analysisFileIn) != 1)
+            int rightOffset, bottomOffset;
+            if (fread(&rightOffset, sizeof(int), 1, m_analysisFileIn) != 1)
             {
                 x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data. Conformance window right offset missing\n");
                 m_aborted = true;
             }
-            else if (m_conformanceWindow.rightOffset && p->analysisLoadReuseLevel > 1)
+            else if (rightOffset && p->analysisLoadReuseLevel > 1)
             {
                 int scaleFactor = p->scaleFactor < 2 ? 1 : p->scaleFactor;
-                padsize = m_conformanceWindow.rightOffset * scaleFactor;
+                padsize = rightOffset * scaleFactor;
                 p->sourceWidth += padsize;
                 m_conformanceWindow.bEnabled = true;
                 m_conformanceWindow.rightOffset = padsize;
             }
 
-            if (fread(&m_conformanceWindow.bottomOffset, sizeof(int), 1, m_analysisFileIn) != 1)
+            if (fread(&bottomOffset, sizeof(int), 1, m_analysisFileIn) != 1)
             {
                 x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data. Conformance window bottom offset missing\n");
                 m_aborted = true;
             }
-            else if (m_conformanceWindow.bottomOffset && p->analysisLoadReuseLevel > 1)
+            else if (bottomOffset && p->analysisLoadReuseLevel > 1)
             {
                 int scaleFactor = p->scaleFactor < 2 ? 1 : p->scaleFactor;
-                padsize = m_conformanceWindow.bottomOffset * scaleFactor;
+                padsize = bottomOffset * scaleFactor;
                 p->sourceHeight += padsize;
                 m_conformanceWindow.bEnabled = true;
                 m_conformanceWindow.bottomOffset = padsize;
@@ -4196,7 +4196,7 @@
         x265_log(p, X265_LOG_WARNING, "Radl requires fixed gop-length (keyint == min-keyint). Disabling radl.\n");
     }
 
-    if ((p->chunkStart || p->chunkEnd) && p->bOpenGOP)
+    if ((p->chunkStart || p->chunkEnd) && p->bOpenGOP && m_param->bResetZoneConfig)
     {
         p->chunkStart = p->chunkEnd = 0;
         x265_log(p, X265_LOG_WARNING, "Chunking requires closed gop structure. Disabling chunking.\n");
@@ -4229,12 +4229,6 @@
         x265_log(p, X265_LOG_WARNING, "Turning on repeat - headers for zone encoding\n");
     }
 
-    if (!m_param->bResetZoneConfig && (p->keyframeMax != p->keyframeMin))
-        x265_log(p, X265_LOG_WARNING, "External zone reconfiguration requires a fixed GOP size to enable appropriate signaling of HRD info\n");
-
-    if (!m_param->bResetZoneConfig && (p->reconfigWindowSize != (uint64_t)p->keyframeMax))
-        x265_log(p, X265_LOG_WARNING, "Zone size must be multiple of GOP size to enable appropriate signaling of HRD info\n");
-
     if (m_param->bEnableHME)
     {
         if (m_param->sourceHeight < 540)
@@ -4311,18 +4305,27 @@
         }
     }
 
+    uint32_t numCUsLoad, numCUsInHeightLoad;
+
     /* Now arrived at the right frame, read the record */
     analysis->poc = poc;
     analysis->frameRecordSize = frameRecordSize;
     X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFileIn, &(picData->sliceType));
     X265_FREAD(&analysis->bScenecut, sizeof(int), 1, m_analysisFileIn, &(picData->bScenecut));
     X265_FREAD(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFileIn, &(picData->satdCost));
-    X265_FREAD(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFileIn, &(picData->numCUsInFrame));
+    X265_FREAD(&numCUsLoad, sizeof(int), 1, m_analysisFileIn, &(picData->numCUsInFrame));
     X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFileIn, &(picData->numPartitions));
 
+    /* Update analysis info to save current settings */
+    uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
+    uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
+    uint32_t numCUsInFrame = widthInCU * heightInCU;
+    analysis->numCUsInFrame = numCUsInFrame;
+    analysis->numCuInHeight = heightInCU;
+
     if (m_param->bDisableLookahead)
     {
-        X265_FREAD(&analysis->numCuInHeight, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->numCuInHeight));
+        X265_FREAD(&numCUsInHeightLoad, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->numCuInHeight));
         X265_FREAD(&analysis->lookahead, sizeof(x265_lookahead_data), 1, m_analysisFileIn, &(picData->lookahead));
     }
     int scaledNumPartition = analysis->numPartitions;
@@ -4335,16 +4338,16 @@
 
     if (m_param->ctuDistortionRefine == CTU_DISTORTION_INTERNAL)
     {
-        X265_FREAD((analysis->distortionData)->ctuDistortion, sizeof(sse_t), analysis->numCUsInFrame, m_analysisFileIn, picDistortion);
+        X265_FREAD((analysis->distortionData)->ctuDistortion, sizeof(sse_t), numCUsLoad, m_analysisFileIn, picDistortion);
         computeDistortionOffset(analysis);
     }
     if (m_param->bDisableLookahead && m_rateControl->m_isVbv)
     {
         size_t vbvCount = m_param->lookaheadDepth + m_param->bframes + 2;
​

x265_3.3.tar.gz/source/encoder/frameencoder.cpp -> x265_3.4.tar.gz/source/encoder/frameencoder.cpp Changed

@@ -130,7 +130,7 @@
         {
             rowSum += sliceGroupSizeAccu;
             m_sliceBaseRow[++sidx] = i;
-        }        
+        }
     }
     X265_CHECK(sidx < m_param->maxSlices, "sliceID check failed!");
     m_sliceBaseRow[0] = 0;
@@ -448,6 +448,18 @@
     m_ssimCnt = 0;
     memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats));
 
+    if (!m_param->bHistBasedSceneCut && m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
+    {
+        int height = m_frame->m_fencPic->m_picHeight;
+        int width = m_frame->m_fencPic->m_picWidth;
+        intptr_t stride = m_frame->m_fencPic->m_stride;
+
+        if (!computeEdge(m_frame->m_edgeBitPic, m_frame->m_fencPic->m_picOrg[0], NULL, stride, height, width, false, 1))
+        {
+            x265_log(m_param, X265_LOG_ERROR, " Failed to compute edge !");
+        }
+    }
+
     /* Emit access unit delimiter unless this is the first frame and the user is
      * not repeating headers (since AUD is supposed to be the first NAL in the access
      * unit) */

 
@@ -130,7 +130,7 @@
         {
             rowSum += sliceGroupSizeAccu;
             m_sliceBaseRow[++sidx] = i;
-        }        
+        }
     }
     X265_CHECK(sidx < m_param->maxSlices, "sliceID check failed!");
     m_sliceBaseRow[0] = 0;
@@ -448,6 +448,18 @@
     m_ssimCnt = 0;
     memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats));
 
+    if (!m_param->bHistBasedSceneCut && m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
+    {
+        int height = m_frame->m_fencPic->m_picHeight;
+        int width = m_frame->m_fencPic->m_picWidth;
+        intptr_t stride = m_frame->m_fencPic->m_stride;
+
+        if (!computeEdge(m_frame->m_edgeBitPic, m_frame->m_fencPic->m_picOrg[0], NULL, stride, height, width, false, 1))
+        {
+            x265_log(m_param, X265_LOG_ERROR, " Failed to compute edge !");
+        }
+    }
+
     /* Emit access unit delimiter unless this is the first frame and the user is
      * not repeating headers (since AUD is supposed to be the first NAL in the access
      * unit) */
​

x265_3.3.tar.gz/source/encoder/ratecontrol.cpp -> x265_3.4.tar.gz/source/encoder/ratecontrol.cpp Changed

@@ -269,7 +269,7 @@
         x265_log(m_param, X265_LOG_WARNING, "NAL HRD parameters require VBV parameters, ignored\n");
         m_param->bEmitHRDSEI = 0;
     }
-    m_isCbr = m_param->rc.rateControlMode == X265_RC_ABR && m_isVbv && !m_2pass && m_param->rc.vbvMaxBitrate <= m_param->rc.bitrate;
+    m_isCbr = m_param->rc.rateControlMode == X265_RC_ABR && m_isVbv && m_param->rc.vbvMaxBitrate <= m_param->rc.bitrate;
     if (m_param->rc.bStrictCbr && !m_isCbr)
     {
         x265_log(m_param, X265_LOG_WARNING, "strict CBR set without CBR mode, ignored\n");
@@ -335,7 +335,7 @@
         int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
         int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
 
-        if (m_param->bEmitHRDSEI)
+        if (m_param->bEmitHRDSEI && !m_param->decoderVbvMaxRate)
         {
             const HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
             vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
@@ -509,6 +509,7 @@
                 CMP_OPT_FIRST_PASS(" keyint", m_param->keyframeMax);
                 CMP_OPT_FIRST_PASS("scenecut", m_param->scenecutThreshold);
                 CMP_OPT_FIRST_PASS("intra-refresh", m_param->bIntraRefresh);
+                CMP_OPT_FIRST_PASS("frame-dup", m_param->bEnableFrameDuplication);
                 if (m_param->bMultiPassOptRPS)
                 {
                     CMP_OPT_FIRST_PASS("multi-pass-opt-rps", m_param->bMultiPassOptRPS);
@@ -546,7 +547,7 @@
                 x265_log(m_param, X265_LOG_WARNING, "2nd pass has fewer frames than 1st pass (%d vs %d)\n",
                          m_param->totalFrames, m_numEntries);
             }
-            if (m_param->totalFrames > m_numEntries)
+            if (m_param->totalFrames > m_numEntries && !m_param->bEnableFrameDuplication)
             {
                 x265_log(m_param, X265_LOG_ERROR, "2nd pass has more frames than 1st pass (%d vs %d)\n",
                          m_param->totalFrames, m_numEntries);
@@ -781,6 +782,10 @@
     // Init HRD
     HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
     hrd->cbrFlag = m_isCbr;
+    if (m_param->reconfigWindowSize) {
+        hrd->cbrFlag = 0;
+        vbvMaxBitrate = m_param->decoderVbvMaxRate * 1000;
+    }
 
     // normalize HRD size and rate to the value / scale notation
     hrd->bitRateScale = x265_clip3(0, 15, calcScale(vbvMaxBitrate) - BR_SHIFT);
@@ -829,7 +834,7 @@
         /* weighted average of cplx of future frames */
         for (int j = 1; j < cplxBlur * 2 && j < m_numEntries - i; j++)
         {
-            int index = m_encOrder[i + j];
+            int index = i+j;
             RateControlEntry *rcj = &m_rce2Pass[index];
             weight *= 1 - pow(rcj->iCuCount / m_ncu, 2);
             if (weight < 0.0001)
@@ -842,7 +847,7 @@
         weight = 1.0;
         for (int j = 0; j <= cplxBlur * 2 && j <= i; j++)
         {
-            int index = m_encOrder[i - j];
+            int index = i-j;
             RateControlEntry *rcj = &m_rce2Pass[index];
             gaussianWeight = weight * exp(-j * j / 200.0);
             weightSum += gaussianWeight;
@@ -851,7 +856,7 @@
             if (weight < .0001)
                 break;
         }
-        m_rce2Pass[m_encOrder[i]].blurredComplexity = cplxSum / weightSum;
+        m_rce2Pass[i].blurredComplexity= cplxSum / weightSum;
     }
     CHECKED_MALLOC(qScale, double, m_numEntries);
     if (filterSize > 1)
@@ -870,7 +875,7 @@
     expectedBits = 1;
     for (int i = 0; i < m_numEntries; i++)
     {
-        RateControlEntry* rce = &m_rce2Pass[m_encOrder[i]];
+        RateControlEntry* rce = &m_rce2Pass[i];
         double q = getQScale(rce, 1.0);
         expectedBits += qScale2bits(rce, q);
         m_lastQScaleFor[rce->sliceType] = q;
@@ -893,15 +898,15 @@
         /* find qscale */
         for (int i = 0; i < m_numEntries; i++)
         {
-            RateControlEntry *rce = &m_rce2Pass[m_encOrder[i]];
+            RateControlEntry *rce = &m_rce2Pass[i];
             qScale[i] = getQScale(rce, rateFactor);
             m_lastQScaleFor[rce->sliceType] = qScale[i];
         }
 
         /* fixed I/B qscale relative to P */
-        for (int i = m_numEntries - 1; i >= 0; i--)
+        for (int i = 0; i < m_numEntries; i++)
         {
-            qScale[i] = getDiffLimitedQScale(&m_rce2Pass[m_encOrder[i]], qScale[i]);
+            qScale[i] = getDiffLimitedQScale(&m_rce2Pass[i], qScale[i]);
             X265_CHECK(qScale[i] >= 0, "qScale became negative\n");
         }
 
@@ -912,7 +917,6 @@
             for (int i = 0; i < m_numEntries; i++)
             {
                 double q = 0.0, sum = 0.0;
-
                 for (int j = 0; j < filterSize; j++)
                 {
                     int idx = i + j - filterSize / 2;
@@ -920,7 +924,7 @@
                     double coeff = qBlur == 0 ? 1.0 : exp(-d * d / (qBlur * qBlur));
                     if (idx < 0 || idx >= m_numEntries)
                         continue;
-                    if (m_rce2Pass[m_encOrder[i]].sliceType != m_rce2Pass[m_encOrder[idx]].sliceType)
+                    if (m_rce2Pass[i].sliceType != m_rce2Pass[idx].sliceType)
                         continue;
                     q += qScale[idx] * coeff;
                     sum += coeff;
@@ -932,7 +936,7 @@
         /* find expected bits */
         for (int i = 0; i < m_numEntries; i++)
         {
-            RateControlEntry *rce = &m_rce2Pass[m_encOrder[i]];
+            RateControlEntry *rce = &m_rce2Pass[i];
             rce->newQScale = clipQscale(NULL, rce, blurredQscale[i]); // check if needed
             X265_CHECK(rce->newQScale >= 0, "new Qscale is negative\n");
             expectedBits += qScale2bits(rce, rce->newQScale);
@@ -1279,6 +1283,7 @@
                 m_param->rc.vbvMaxBitrate = m_param->rc.zones[i].zoneParam->rc.vbvMaxBitrate;
                 memcpy(m_relativeComplexity, m_param->rc.zones[i].relativeComplexity, sizeof(double) * m_param->reconfigWindowSize);
                 reconfigureRC();
+                m_isCbr = 1; /* Always vbvmaxrate == bitrate here*/
                 m_top->zoneReadCount[i].incr();
             }
         }
@@ -1951,7 +1956,7 @@
                 /* Adjust quant based on the difference between
                  * achieved and expected bitrate so far */
                 double curTime = (double)rce->encodeOrder / m_numEntries;
-                double w = x265_clip3(0.0, 1.0, curTime * 100);
+                double w = x265_clip3(0.0, 1.0, curTime);
                 q *= pow((double)m_totalBits / m_expectedBitsSum, w);
             }
             if (m_framesDone == 0 && m_param->rc.rateControlMode == X265_RC_ABR && m_isGrainEnabled)
@@ -2742,7 +2747,9 @@
         x265_log(m_param, X265_LOG_WARNING, "poc:%d, VBV underflow (%.0f bits)\n", rce->poc, m_bufferFillFinal);
 
     m_bufferFillFinal = X265_MAX(m_bufferFillFinal, 0);
-    m_bufferFillFinal += m_bufferRate;
+    m_bufferFillFinal += rce->bufferRate;
+    if (m_param->csvLogLevel >= 2)
+        m_unclippedBufferFillFinal = m_bufferFillFinal;
 
     if (m_param->rc.bStrictCbr)
     {
@@ -2752,14 +2759,14 @@
             filler += FILLER_OVERHEAD * 8;
         }
         m_bufferFillFinal -= filler;
-        bufferBits = X265_MIN(bits + filler + m_bufferExcess, m_bufferRate);
+        bufferBits = X265_MIN(bits + filler + m_bufferExcess, rce->bufferRate);
         m_bufferExcess = X265_MAX(m_bufferExcess - bufferBits + bits + filler, 0);
         m_bufferFillActual += bufferBits - bits - filler;
     }
     else
     {
         m_bufferFillFinal = X265_MIN(m_bufferFillFinal, m_bufferSize);
-        bufferBits = X265_MIN(bits + m_bufferExcess, m_bufferRate);
+        bufferBits = X265_MIN(bits + m_bufferExcess, rce->bufferRate);
         m_bufferExcess = X265_MAX(m_bufferExcess - bufferBits + bits, 0);
         m_bufferFillActual += bufferBits - bits;
         m_bufferFillActual = X265_MIN(m_bufferFillActual, m_bufferSize);

 
@@ -269,7 +269,7 @@
         x265_log(m_param, X265_LOG_WARNING, "NAL HRD parameters require VBV parameters, ignored\n");
         m_param->bEmitHRDSEI = 0;
     }
-    m_isCbr = m_param->rc.rateControlMode == X265_RC_ABR && m_isVbv && !m_2pass && m_param->rc.vbvMaxBitrate <= m_param->rc.bitrate;
+    m_isCbr = m_param->rc.rateControlMode == X265_RC_ABR && m_isVbv && m_param->rc.vbvMaxBitrate <= m_param->rc.bitrate;
     if (m_param->rc.bStrictCbr && !m_isCbr)
     {
         x265_log(m_param, X265_LOG_WARNING, "strict CBR set without CBR mode, ignored\n");
@@ -335,7 +335,7 @@
         int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
         int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
 
-        if (m_param->bEmitHRDSEI)
+        if (m_param->bEmitHRDSEI && !m_param->decoderVbvMaxRate)
         {
             const HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
             vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
@@ -509,6 +509,7 @@
                 CMP_OPT_FIRST_PASS(" keyint", m_param->keyframeMax);
                 CMP_OPT_FIRST_PASS("scenecut", m_param->scenecutThreshold);
                 CMP_OPT_FIRST_PASS("intra-refresh", m_param->bIntraRefresh);
+                CMP_OPT_FIRST_PASS("frame-dup", m_param->bEnableFrameDuplication);
                 if (m_param->bMultiPassOptRPS)
                 {
                     CMP_OPT_FIRST_PASS("multi-pass-opt-rps", m_param->bMultiPassOptRPS);
@@ -546,7 +547,7 @@
                 x265_log(m_param, X265_LOG_WARNING, "2nd pass has fewer frames than 1st pass (%d vs %d)\n",
                          m_param->totalFrames, m_numEntries);
             }
-            if (m_param->totalFrames > m_numEntries)
+            if (m_param->totalFrames > m_numEntries && !m_param->bEnableFrameDuplication)
             {
                 x265_log(m_param, X265_LOG_ERROR, "2nd pass has more frames than 1st pass (%d vs %d)\n",
                          m_param->totalFrames, m_numEntries);
@@ -781,6 +782,10 @@
     // Init HRD
     HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
     hrd->cbrFlag = m_isCbr;
+    if (m_param->reconfigWindowSize) {
+        hrd->cbrFlag = 0;
+        vbvMaxBitrate = m_param->decoderVbvMaxRate * 1000;
+    }
 
     // normalize HRD size and rate to the value / scale notation
     hrd->bitRateScale = x265_clip3(0, 15, calcScale(vbvMaxBitrate) - BR_SHIFT);
@@ -829,7 +834,7 @@
         /* weighted average of cplx of future frames */
         for (int j = 1; j < cplxBlur * 2 && j < m_numEntries - i; j++)
         {
-            int index = m_encOrder[i + j];
+            int index = i+j;
             RateControlEntry *rcj = &m_rce2Pass[index];
             weight *= 1 - pow(rcj->iCuCount / m_ncu, 2);
             if (weight < 0.0001)
@@ -842,7 +847,7 @@
         weight = 1.0;
         for (int j = 0; j <= cplxBlur * 2 && j <= i; j++)
         {
-            int index = m_encOrder[i - j];
+            int index = i-j;
             RateControlEntry *rcj = &m_rce2Pass[index];
             gaussianWeight = weight * exp(-j * j / 200.0);
             weightSum += gaussianWeight;
@@ -851,7 +856,7 @@
             if (weight < .0001)
                 break;
         }
-        m_rce2Pass[m_encOrder[i]].blurredComplexity = cplxSum / weightSum;
+        m_rce2Pass[i].blurredComplexity= cplxSum / weightSum;
     }
     CHECKED_MALLOC(qScale, double, m_numEntries);
     if (filterSize > 1)
@@ -870,7 +875,7 @@
     expectedBits = 1;
     for (int i = 0; i < m_numEntries; i++)
     {
-        RateControlEntry* rce = &m_rce2Pass[m_encOrder[i]];
+        RateControlEntry* rce = &m_rce2Pass[i];
         double q = getQScale(rce, 1.0);
         expectedBits += qScale2bits(rce, q);
         m_lastQScaleFor[rce->sliceType] = q;
@@ -893,15 +898,15 @@
         /* find qscale */
         for (int i = 0; i < m_numEntries; i++)
         {
-            RateControlEntry *rce = &m_rce2Pass[m_encOrder[i]];
+            RateControlEntry *rce = &m_rce2Pass[i];
             qScale[i] = getQScale(rce, rateFactor);
             m_lastQScaleFor[rce->sliceType] = qScale[i];
         }
 
         /* fixed I/B qscale relative to P */
-        for (int i = m_numEntries - 1; i >= 0; i--)
+        for (int i = 0; i < m_numEntries; i++)
         {
-            qScale[i] = getDiffLimitedQScale(&m_rce2Pass[m_encOrder[i]], qScale[i]);
+            qScale[i] = getDiffLimitedQScale(&m_rce2Pass[i], qScale[i]);
             X265_CHECK(qScale[i] >= 0, "qScale became negative\n");
         }
 
@@ -912,7 +917,6 @@
             for (int i = 0; i < m_numEntries; i++)
             {
                 double q = 0.0, sum = 0.0;
-
                 for (int j = 0; j < filterSize; j++)
                 {
                     int idx = i + j - filterSize / 2;
@@ -920,7 +924,7 @@
                     double coeff = qBlur == 0 ? 1.0 : exp(-d * d / (qBlur * qBlur));
                     if (idx < 0 || idx >= m_numEntries)
                         continue;
-                    if (m_rce2Pass[m_encOrder[i]].sliceType != m_rce2Pass[m_encOrder[idx]].sliceType)
+                    if (m_rce2Pass[i].sliceType != m_rce2Pass[idx].sliceType)
                         continue;
                     q += qScale[idx] * coeff;
                     sum += coeff;
@@ -932,7 +936,7 @@
         /* find expected bits */
         for (int i = 0; i < m_numEntries; i++)
         {
-            RateControlEntry *rce = &m_rce2Pass[m_encOrder[i]];
+            RateControlEntry *rce = &m_rce2Pass[i];
             rce->newQScale = clipQscale(NULL, rce, blurredQscale[i]); // check if needed
             X265_CHECK(rce->newQScale >= 0, "new Qscale is negative\n");
             expectedBits += qScale2bits(rce, rce->newQScale);
@@ -1279,6 +1283,7 @@
                 m_param->rc.vbvMaxBitrate = m_param->rc.zones[i].zoneParam->rc.vbvMaxBitrate;
                 memcpy(m_relativeComplexity, m_param->rc.zones[i].relativeComplexity, sizeof(double) * m_param->reconfigWindowSize);
                 reconfigureRC();
+                m_isCbr = 1; /* Always vbvmaxrate == bitrate here*/
                 m_top->zoneReadCount[i].incr();
             }
         }
@@ -1951,7 +1956,7 @@
                 /* Adjust quant based on the difference between
                  * achieved and expected bitrate so far */
                 double curTime = (double)rce->encodeOrder / m_numEntries;
-                double w = x265_clip3(0.0, 1.0, curTime * 100);
+                double w = x265_clip3(0.0, 1.0, curTime);
                 q *= pow((double)m_totalBits / m_expectedBitsSum, w);
             }
             if (m_framesDone == 0 && m_param->rc.rateControlMode == X265_RC_ABR && m_isGrainEnabled)
@@ -2742,7 +2747,9 @@
         x265_log(m_param, X265_LOG_WARNING, "poc:%d, VBV underflow (%.0f bits)\n", rce->poc, m_bufferFillFinal);
 
     m_bufferFillFinal = X265_MAX(m_bufferFillFinal, 0);
-    m_bufferFillFinal += m_bufferRate;
+    m_bufferFillFinal += rce->bufferRate;
+    if (m_param->csvLogLevel >= 2)
+        m_unclippedBufferFillFinal = m_bufferFillFinal;
 
     if (m_param->rc.bStrictCbr)
     {
@@ -2752,14 +2759,14 @@
             filler += FILLER_OVERHEAD * 8;
         }
         m_bufferFillFinal -= filler;
-        bufferBits = X265_MIN(bits + filler + m_bufferExcess, m_bufferRate);
+        bufferBits = X265_MIN(bits + filler + m_bufferExcess, rce->bufferRate);
         m_bufferExcess = X265_MAX(m_bufferExcess - bufferBits + bits + filler, 0);
         m_bufferFillActual += bufferBits - bits - filler;
     }
     else
     {
         m_bufferFillFinal = X265_MIN(m_bufferFillFinal, m_bufferSize);
-        bufferBits = X265_MIN(bits + m_bufferExcess, m_bufferRate);
+        bufferBits = X265_MIN(bits + m_bufferExcess, rce->bufferRate);
         m_bufferExcess = X265_MAX(m_bufferExcess - bufferBits + bits, 0);
         m_bufferFillActual += bufferBits - bits;
         m_bufferFillActual = X265_MIN(m_bufferFillActual, m_bufferSize);
​

x265_3.3.tar.gz/source/encoder/ratecontrol.h -> x265_3.4.tar.gz/source/encoder/ratecontrol.h Changed

 
@@ -157,6 +157,7 @@
     double m_rateFactorConstant;
     double m_bufferSize;
     double m_bufferFillFinal;  /* real buffer as of the last finished frame */
+    double m_unclippedBufferFillFinal; /* real unclipped buffer as of the last finished frame used to log in CSV*/
     double m_bufferFill;       /* planned buffer, if all in-progress frames hit their bit budget */
     double m_bufferRate;       /* # of bits added to buffer_fill after each frame */
     double m_vbvMaxRate;       /* in kbps */
​

x265_3.3.tar.gz/source/encoder/slicetype.cpp -> x265_3.4.tar.gz/source/encoder/slicetype.cpp Changed

@@ -87,7 +87,7 @@
 
 namespace X265_NS {
 
-bool computeEdge(pixel *edgePic, pixel *refPic, pixel *edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta)
+bool computeEdge(pixel* edgePic, pixel* refPic, pixel* edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta, pixel whitePixel)
 {
     intptr_t rowOne = 0, rowTwo = 0, rowThree = 0, colOne = 0, colTwo = 0, colThree = 0;
     intptr_t middle = 0, topLeft = 0, topRight = 0, bottomLeft = 0, bottomRight = 0;
@@ -141,7 +141,7 @@
                        theta = 180 + theta;
                     edgeTheta[middle] = (pixel)theta;
                 }
-                edgePic[middle] = (pixel)(gradientMagnitude >= edgeThreshold ? edgeThreshold : blackPixel);
+                edgePic[middle] = (pixel)(gradientMagnitude >= EDGE_THRESHOLD ? whitePixel : blackPixel);
             }
         }
         return true;
@@ -519,6 +519,13 @@
                 if (param->rc.aqMode == X265_AQ_EDGE)
                     edgeFilter(curFrame, param);
 
+                if (param->rc.aqMode == X265_AQ_EDGE && !param->bHistBasedSceneCut && param->recursionSkipMode == EDGE_BASED_RSKIP)
+                {
+                    pixel* src = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
+                    primitives.planecopy_pp_shr(src, curFrame->m_fencPic->m_stride, curFrame->m_edgeBitPic,
+                        curFrame->m_fencPic->m_stride, curFrame->m_fencPic->m_picWidth, curFrame->m_fencPic->m_picHeight, SHIFT_TO_BITPLANE);
+                }
+
                 if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED || param->rc.aqMode == X265_AQ_EDGE)
                 {
                     double bit_depth_correction = 1.f / (1 << (2 * (X265_DEPTH - 8)));

 
@@ -87,7 +87,7 @@
 
 namespace X265_NS {
 
-bool computeEdge(pixel *edgePic, pixel *refPic, pixel *edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta)
+bool computeEdge(pixel* edgePic, pixel* refPic, pixel* edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta, pixel whitePixel)
 {
     intptr_t rowOne = 0, rowTwo = 0, rowThree = 0, colOne = 0, colTwo = 0, colThree = 0;
     intptr_t middle = 0, topLeft = 0, topRight = 0, bottomLeft = 0, bottomRight = 0;
@@ -141,7 +141,7 @@
                        theta = 180 + theta;
                     edgeTheta[middle] = (pixel)theta;
                 }
-                edgePic[middle] = (pixel)(gradientMagnitude >= edgeThreshold ? edgeThreshold : blackPixel);
+                edgePic[middle] = (pixel)(gradientMagnitude >= EDGE_THRESHOLD ? whitePixel : blackPixel);
             }
         }
         return true;
@@ -519,6 +519,13 @@
                 if (param->rc.aqMode == X265_AQ_EDGE)
                     edgeFilter(curFrame, param);
 
+                if (param->rc.aqMode == X265_AQ_EDGE && !param->bHistBasedSceneCut && param->recursionSkipMode == EDGE_BASED_RSKIP)
+                {
+                    pixel* src = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
+                    primitives.planecopy_pp_shr(src, curFrame->m_fencPic->m_stride, curFrame->m_edgeBitPic,
+                        curFrame->m_fencPic->m_stride, curFrame->m_fencPic->m_picWidth, curFrame->m_fencPic->m_picHeight, SHIFT_TO_BITPLANE);
+                }
+
                 if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED || param->rc.aqMode == X265_AQ_EDGE)
                 {
                     double bit_depth_correction = 1.f / (1 << (2 * (X265_DEPTH - 8)));
​

x265_3.3.tar.gz/source/encoder/slicetype.h -> x265_3.4.tar.gz/source/encoder/slicetype.h Changed

@@ -44,9 +44,9 @@
 #define EDGE_INCLINATION 45
 
 #if HIGH_BIT_DEPTH
-#define edgeThreshold 1023.0
+#define EDGE_THRESHOLD 1023.0
 #else
-#define edgeThreshold 255.0
+#define EDGE_THRESHOLD 255.0
 #endif
 #define PI 3.14159265
 
@@ -101,7 +101,7 @@
 protected:
 
     uint32_t acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp, uint32_t qgSize);
-    uint32_t edgeDensityCu(Frame*curFrame, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize);
+    uint32_t edgeDensityCu(Frame* curFrame, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize);
     uint32_t lumaSumCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, uint32_t qgSize);
     uint32_t weightCostLuma(Lowres& fenc, Lowres& ref, WeightParam& wp);
     bool     allocWeightedRef(Lowres& fenc);
@@ -265,7 +265,6 @@
     CostEstimateGroup& operator=(const CostEstimateGroup&);
 };
 
-bool computeEdge(pixel *edgePic, pixel *refPic, pixel *edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta);
-
+bool computeEdge(pixel* edgePic, pixel* refPic, pixel* edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta, pixel whitePixel = EDGE_THRESHOLD);
 }
 #endif // ifndef X265_SLICETYPE_H

 
@@ -44,9 +44,9 @@
 #define EDGE_INCLINATION 45
 
 #if HIGH_BIT_DEPTH
-#define edgeThreshold 1023.0
+#define EDGE_THRESHOLD 1023.0
 #else
-#define edgeThreshold 255.0
+#define EDGE_THRESHOLD 255.0
 #endif
 #define PI 3.14159265
 
@@ -101,7 +101,7 @@
 protected:
 
     uint32_t acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp, uint32_t qgSize);
-    uint32_t edgeDensityCu(Frame*curFrame, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize);
+    uint32_t edgeDensityCu(Frame* curFrame, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize);
     uint32_t lumaSumCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, uint32_t qgSize);
     uint32_t weightCostLuma(Lowres& fenc, Lowres& ref, WeightParam& wp);
     bool     allocWeightedRef(Lowres& fenc);
@@ -265,7 +265,6 @@
     CostEstimateGroup& operator=(const CostEstimateGroup&);
 };
 
-bool computeEdge(pixel *edgePic, pixel *refPic, pixel *edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta);
-
+bool computeEdge(pixel* edgePic, pixel* refPic, pixel* edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta, pixel whitePixel = EDGE_THRESHOLD);
 }
 #endif // ifndef X265_SLICETYPE_H
​

x265_3.3.tar.gz/source/test/CMakeLists.txt -> x265_3.4.tar.gz/source/test/CMakeLists.txt Changed

 
@@ -23,13 +23,15 @@
 
 # add ARM assembly files
 if(ARM OR CROSS_COMPILE_ARM)
-    enable_language(ASM)
-    set(NASM_SRC checkasm-arm.S)
-    add_custom_command(
-        OUTPUT checkasm-arm.obj
-        COMMAND ${CMAKE_CXX_COMPILER}
-        ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
-        DEPENDS checkasm-arm.S)
+    if(NOT ARM64)
+        enable_language(ASM)
+        set(NASM_SRC checkasm-arm.S)
+        add_custom_command(
+            OUTPUT checkasm-arm.obj
+            COMMAND ${CMAKE_CXX_COMPILER}
+            ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
+            DEPENDS checkasm-arm.S)
+    endif()
 endif(ARM OR CROSS_COMPILE_ARM)
 
 # add PowerPC assembly files
​

x265_3.3.tar.gz/source/test/regression-tests.txt -> x265_3.4.tar.gz/source/test/regression-tests.txt Changed

@@ -75,7 +75,7 @@
 News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
 News-4k.y4m,--preset superfast --slices 4 --aq-mode 0 
 News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
-News-4k.y4m,--preset veryslow --no-rskip
+News-4k.y4m,--preset veryslow --rskip 0
 News-4k.y4m,--preset veryslow --pme --crf 40
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset medium --no-weightp
@@ -162,7 +162,11 @@
 sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02 --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
 sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02
 sintel_trailer_2k_1920x1080_24.yuv, --preset ultrafast --hist-scenecut --hist-threshold 0.02
-
+crowd_run_1920x1080_50.yuv, --preset faster --ctu 32 --rskip 2 --rskip-edge-threshold 5
+crowd_run_1920x1080_50.yuv, --preset fast --ctu 64 --rskip 2 --rskip-edge-threshold 5 --aq-mode 4
+crowd_run_1920x1080_50.yuv, --preset slow --ctu 32 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1
+crowd_run_1920x1080_50.yuv, --preset slower --ctu 16 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1 --aq-mode 4
+ 
 # Main12 intraCost overflow bug test
 720p50_parkrun_ter.y4m,--preset medium

 
@@ -75,7 +75,7 @@
 News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
 News-4k.y4m,--preset superfast --slices 4 --aq-mode 0 
 News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
-News-4k.y4m,--preset veryslow --no-rskip
+News-4k.y4m,--preset veryslow --rskip 0
 News-4k.y4m,--preset veryslow --pme --crf 40
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset medium --no-weightp
@@ -162,7 +162,11 @@
 sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02 --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
 sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02
 sintel_trailer_2k_1920x1080_24.yuv, --preset ultrafast --hist-scenecut --hist-threshold 0.02
-
+crowd_run_1920x1080_50.yuv, --preset faster --ctu 32 --rskip 2 --rskip-edge-threshold 5
+crowd_run_1920x1080_50.yuv, --preset fast --ctu 64 --rskip 2 --rskip-edge-threshold 5 --aq-mode 4
+crowd_run_1920x1080_50.yuv, --preset slow --ctu 32 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1
+crowd_run_1920x1080_50.yuv, --preset slower --ctu 16 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1 --aq-mode 4
+ 
 # Main12 intraCost overflow bug test
 720p50_parkrun_ter.y4m,--preset medium
 
​

x265_3.3.tar.gz/source/test/save-load-tests.txt -> x265_3.4.tar.gz/source/test/save-load-tests.txt Changed

@@ -18,3 +18,4 @@
 RaceHorses_416x240_30.y4m,   --preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m,    --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,   --preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
 crowd_run_540p50.y4m,   --preset veryslow --no-cutree --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,   --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset veryslow --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset veryslow --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
 crowd_run_540p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset medium --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
+News-4k.y4m,  --preset medium --analysis-save x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000::News-4k.y4m, --analysis-load x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000

 
@@ -18,3 +18,4 @@
 RaceHorses_416x240_30.y4m,   --preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m,    --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,   --preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
 crowd_run_540p50.y4m,   --preset veryslow --no-cutree --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,   --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset veryslow --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset veryslow --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
 crowd_run_540p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset medium --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
+News-4k.y4m,  --preset medium --analysis-save x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000::News-4k.y4m, --analysis-load x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
​

x265_3.3.tar.gz/source/test/testbench.cpp -> x265_3.4.tar.gz/source/test/testbench.cpp Changed

@@ -5,6 +5,7 @@
  *          Mandar Gurav <mandar@multicorewareinc.com>
  *          Mahesh Pittala <mahesh@multicorewareinc.com>
  *          Min Chen <chenm003@163.com>
+ *          Yimeng Su <yimeng.su@huawei.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -208,6 +209,14 @@
         EncoderPrimitives asmprim;
         memset(&asmprim, 0, sizeof(asmprim));
         setupAssemblyPrimitives(asmprim, test_arch[i].flag);
+
+#if X265_ARCH_ARM64
+        /* Temporary workaround: the luma_vsp assembly primitive has not been completed yet,
+         * but interp_8tap_hv_pp_cpu uses a mix of C and assembly primitives.
+         * Without this call, a segmentation fault occurs. */
+        setupAliasCPrimitives(cprim, asmprim, test_arch[i].flag);
+#endif
+
         setupAliasPrimitives(asmprim);
         memcpy(&primitives, &asmprim, sizeof(EncoderPrimitives));
         for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
@@ -232,6 +241,13 @@
 #endif
     setupAssemblyPrimitives(optprim, cpuid);
 
+#if X265_ARCH_ARM64
+    /* Temporary workaround: the luma_vsp assembly primitive has not been completed yet,
+     * but interp_8tap_hv_pp_cpu uses a mix of C and assembly primitives.
+     * Without this call, a segmentation fault occurs. */
+    setupAliasCPrimitives(cprim, optprim, cpuid);
+#endif
+
     /* Note that we do not setup aliases for performance tests, that would be
      * redundant. The testbench only verifies they are correctly aliased */

 
@@ -5,6 +5,7 @@
  *          Mandar Gurav <mandar@multicorewareinc.com>
  *          Mahesh Pittala <mahesh@multicorewareinc.com>
  *          Min Chen <chenm003@163.com>
+ *          Yimeng Su <yimeng.su@huawei.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -208,6 +209,14 @@
         EncoderPrimitives asmprim;
         memset(&asmprim, 0, sizeof(asmprim));
         setupAssemblyPrimitives(asmprim, test_arch[i].flag);
+
+#if X265_ARCH_ARM64
+        /* Temporary workaround: the luma_vsp assembly primitive has not been completed yet,
+         * but interp_8tap_hv_pp_cpu uses a mix of C and assembly primitives.
+         * Without this call, a segmentation fault occurs. */
+        setupAliasCPrimitives(cprim, asmprim, test_arch[i].flag);
+#endif
+
         setupAliasPrimitives(asmprim);
         memcpy(&primitives, &asmprim, sizeof(EncoderPrimitives));
         for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
@@ -232,6 +241,13 @@
 #endif
     setupAssemblyPrimitives(optprim, cpuid);
 
+#if X265_ARCH_ARM64
+    /* Temporary workaround: the luma_vsp assembly primitive has not been completed yet,
+     * but interp_8tap_hv_pp_cpu uses a mix of C and assembly primitives.
+     * Without this call, a segmentation fault occurs. */
+    setupAliasCPrimitives(cprim, optprim, cpuid);
+#endif
+
     /* Note that we do not setup aliases for performance tests, that would be
      * redundant. The testbench only verifies they are correctly aliased */
 
​

x265_3.3.tar.gz/source/test/testharness.h -> x265_3.4.tar.gz/source/test/testharness.h Changed

 
@@ -3,6 +3,7 @@
  *
  * Authors: Steve Borho <steve@borho.org>
  *          Min Chen <chenm003@163.com>
+ *          Yimeng Su <yimeng.su@huawei.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -81,12 +82,16 @@
 #if X265_ARCH_X86
     asm volatile("rdtsc" : "=a" (a) ::"edx");
 #elif X265_ARCH_ARM
+#if X265_ARCH_ARM64
+    asm volatile("mrs %0, cntvct_el0" : "=r"(a));
+#else
     // TO-DO: verify the following inline asm to get the CPU timestamp counter for the ARM arch
     // asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));
 
     // TO-DO: replace clock() function with appropriate ARM cpu instructions
     a = clock();
 #endif
+#endif
     return a;
 }
 #endif // ifdef _MSC_VER
​

x265_3.3.tar.gz/source/x265.cpp -> x265_3.4.tar.gz/source/x265.cpp Changed

@@ -27,11 +27,7 @@
 
 #include "x265.h"
 #include "x265cli.h"
-
-#include "input/input.h"
-#include "output/output.h"
-#include "output/reconplay.h"
-#include "svt.h"
+#include "abrEncApp.h"
 
 #if HAVE_VLD
 /* Visual Leak Detector */
@@ -47,191 +43,59 @@
 #include <fstream>
 #include <queue>
 
-#define CONSOLE_TITLE_SIZE 200
-#ifdef _WIN32
-#include <windows.h>
-#define SetThreadExecutionState(es)
-static char orgConsoleTitle[CONSOLE_TITLE_SIZE] = "";
-#else
-#define GetConsoleTitle(t, n)
-#define SetConsoleTitle(t)
-#define SetThreadExecutionState(es)
-#endif
-
 using namespace X265_NS;
 
-/* Ctrl-C handler */
-static volatile sig_atomic_t b_ctrl_c /* = 0 */;
-static void sigint_handler(int)
-{
-    b_ctrl_c = 1;
-}
-#define START_CODE 0x00000001
-#define START_CODE_BYTES 4
-
-struct CLIOptions
-{
-    InputFile* input;
-    ReconFile* recon;
-    OutputFile* output;
-    FILE*       qpfile;
-    FILE*       zoneFile;
-    FILE*    dolbyVisionRpu;    /* File containing Dolby Vision BL RPU metadata */
-    const char* reconPlayCmd;
-    const x265_api* api;
-    x265_param* param;
-    x265_vmaf_data* vmafData;
-    bool bProgress;
-    bool bForceY4m;
-    bool bDither;
-    uint32_t seek;              // number of frames to skip from the beginning
-    uint32_t framesToBeEncoded; // number of frames to encode
-    uint64_t totalbytes;
-    int64_t startTime;
-    int64_t prevUpdateTime;
-
-    /* in microseconds */
-    static const int UPDATE_INTERVAL = 250000;
-
-    CLIOptions()
-    {
-        input = NULL;
-        recon = NULL;
-        output = NULL;
-        qpfile = NULL;
-        zoneFile = NULL;
-        dolbyVisionRpu = NULL;
-        reconPlayCmd = NULL;
-        api = NULL;
-        param = NULL;
-        vmafData = NULL;
-        framesToBeEncoded = seek = 0;
-        totalbytes = 0;
-        bProgress = true;
-        bForceY4m = false;
-        startTime = x265_mdate();
-        prevUpdateTime = 0;
-        bDither = false;
-    }
+#define X265_HEAD_ENTRIES 3
 
-    void destroy();
-    void printStatus(uint32_t frameNum);
-    bool parse(int argc, char **argv);
-    bool parseZoneParam(int argc, char **argv, x265_param* globalParam, int zonefileCount);
-    bool parseQPFile(x265_picture &pic_org);
-    bool parseZoneFile();
-};
-
-void CLIOptions::destroy()
-{
-    if (input)
-        input->release();
-    input = NULL;
-    if (recon)
-        recon->release();
-    recon = NULL;
-    if (qpfile)
-        fclose(qpfile);
-    qpfile = NULL;
-    if (zoneFile)
-        fclose(zoneFile);
-    zoneFile = NULL;
-    if (dolbyVisionRpu)
-        fclose(dolbyVisionRpu);
-    dolbyVisionRpu = NULL;
-    if (output)
-        output->release();
-    output = NULL;
-}
-
-void CLIOptions::printStatus(uint32_t frameNum)
-{
-    char buf[200];
-    int64_t time = x265_mdate();
-
-    if (!bProgress || !frameNum || (prevUpdateTime && time - prevUpdateTime < UPDATE_INTERVAL))
-        return;
-
-    int64_t elapsed = time - startTime;
-    double fps = elapsed > 0 ? frameNum * 1000000. / elapsed : 0;
-    float bitrate = 0.008f * totalbytes * (param->fpsNum / param->fpsDenom) / ((float)frameNum);
-    if (framesToBeEncoded)
-    {
-        int eta = (int)(elapsed * (framesToBeEncoded - frameNum) / ((int64_t)frameNum * 1000000));
-        sprintf(buf, "x265 [%.1f%%] %d/%d frames, %.2f fps, %.2f kb/s, eta %d:%02d:%02d",
-            100. * frameNum / (param->chunkEnd ? param->chunkEnd : param->totalFrames), frameNum, (param->chunkEnd ? param->chunkEnd : param->totalFrames), fps, bitrate,
-                eta / 3600, (eta / 60) % 60, eta % 60);
-    }
-    else
-        sprintf(buf, "x265 %d frames: %.2f fps, %.2f kb/s", frameNum, fps, bitrate);
-
-    fprintf(stderr, "%s  \r", buf + 5);
-    SetConsoleTitle(buf);
-    fflush(stderr); // needed in windows
-    prevUpdateTime = time;
-}
+#ifdef _WIN32
+#define strdup _strdup
+#endif
 
-bool CLIOptions::parseZoneParam(int argc, char **argv, x265_param* globalParam, int zonefileCount)
+#ifdef _WIN32
+/* Copy of x264 code, which allows for Unicode characters in the command line.
+ * Retrieve command line arguments as UTF-8. */
+static int get_argv_utf8(int *argc_ptr, char ***argv_ptr)
 {
-    bool bError = false;
-    int bShowHelp = false;
-    int outputBitDepth = 0;
-    const char *profile = NULL;
-
-    /* Presets are applied before all other options. */
-    for (optind = 0;;)
-    {
-        int c = getopt_long(argc, argv, short_options, long_options, NULL);
-        if (c == -1)
-            break;
-        else if (c == 'D')
-            outputBitDepth = atoi(optarg);
-        else if (c == 'P')
-            profile = optarg;
-        else if (c == '?')
-            bShowHelp = true;
-    }
-
-    if (!outputBitDepth && profile)
-    {
-        /* try to derive the output bit depth from the requested profile */
-        if (strstr(profile, "10"))
-            outputBitDepth = 10;
-        else if (strstr(profile, "12"))
-            outputBitDepth = 12;
-        else
-            outputBitDepth = 8;
-    }
-
-    api = x265_api_get(outputBitDepth);
-    if (!api)
+    int ret = 0;
+    wchar_t **argv_utf16 = CommandLineToArgvW(GetCommandLineW(), argc_ptr);
+    if (argv_utf16)
     {
-        x265_log(NULL, X265_LOG_WARNING, "falling back to default bit-depth\n");
-        api = x265_api_get(0);
-    }
+        int argc = *argc_ptr;
+        int offset = (argc + 1) * sizeof(char*);
+        int size = offset;
 
-    if (bShowHelp)
-    {
-        printVersion(globalParam, api);
-        showHelp(globalParam);
-    }
+        for (int i = 0; i < argc; i++)

 
@@ -27,11 +27,7 @@
 
 #include "x265.h"
 #include "x265cli.h"
-
-#include "input/input.h"
-#include "output/output.h"
-#include "output/reconplay.h"
-#include "svt.h"
+#include "abrEncApp.h"
 
 #if HAVE_VLD
 /* Visual Leak Detector */
@@ -47,191 +43,59 @@
 #include <fstream>
 #include <queue>
 
-#define CONSOLE_TITLE_SIZE 200
-#ifdef _WIN32
-#include <windows.h>
-#define SetThreadExecutionState(es)
-static char orgConsoleTitle[CONSOLE_TITLE_SIZE] = "";
-#else
-#define GetConsoleTitle(t, n)
-#define SetConsoleTitle(t)
-#define SetThreadExecutionState(es)
-#endif
-
 using namespace X265_NS;
 
-/* Ctrl-C handler */
-static volatile sig_atomic_t b_ctrl_c /* = 0 */;
-static void sigint_handler(int)
-{
-    b_ctrl_c = 1;
-}
-#define START_CODE 0x00000001
-#define START_CODE_BYTES 4
-
-struct CLIOptions
-{
-    InputFile* input;
-    ReconFile* recon;
-    OutputFile* output;
-    FILE*       qpfile;
-    FILE*       zoneFile;
-    FILE*    dolbyVisionRpu;    /* File containing Dolby Vision BL RPU metadata */
-    const char* reconPlayCmd;
-    const x265_api* api;
-    x265_param* param;
-    x265_vmaf_data* vmafData;
-    bool bProgress;
-    bool bForceY4m;
-    bool bDither;
-    uint32_t seek;              // number of frames to skip from the beginning
-    uint32_t framesToBeEncoded; // number of frames to encode
-    uint64_t totalbytes;
-    int64_t startTime;
-    int64_t prevUpdateTime;
-
-    /* in microseconds */
-    static const int UPDATE_INTERVAL = 250000;
-
-    CLIOptions()
-    {
-        input = NULL;
-        recon = NULL;
-        output = NULL;
-        qpfile = NULL;
-        zoneFile = NULL;
-        dolbyVisionRpu = NULL;
-        reconPlayCmd = NULL;
-        api = NULL;
-        param = NULL;
-        vmafData = NULL;
-        framesToBeEncoded = seek = 0;
-        totalbytes = 0;
-        bProgress = true;
-        bForceY4m = false;
-        startTime = x265_mdate();
-        prevUpdateTime = 0;
-        bDither = false;
-    }
+#define X265_HEAD_ENTRIES 3
 
-    void destroy();
-    void printStatus(uint32_t frameNum);
-    bool parse(int argc, char **argv);
-    bool parseZoneParam(int argc, char **argv, x265_param* globalParam, int zonefileCount);
-    bool parseQPFile(x265_picture &pic_org);
-    bool parseZoneFile();
-};
-
-void CLIOptions::destroy()
-{
-    if (input)
-        input->release();
-    input = NULL;
-    if (recon)
-        recon->release();
-    recon = NULL;
-    if (qpfile)
-        fclose(qpfile);
-    qpfile = NULL;
-    if (zoneFile)
-        fclose(zoneFile);
-    zoneFile = NULL;
-    if (dolbyVisionRpu)
-        fclose(dolbyVisionRpu);
-    dolbyVisionRpu = NULL;
-    if (output)
-        output->release();
-    output = NULL;
-}
-
-void CLIOptions::printStatus(uint32_t frameNum)
-{
-    char buf[200];
-    int64_t time = x265_mdate();
-
-    if (!bProgress || !frameNum || (prevUpdateTime && time - prevUpdateTime < UPDATE_INTERVAL))
-        return;
-
-    int64_t elapsed = time - startTime;
-    double fps = elapsed > 0 ? frameNum * 1000000. / elapsed : 0;
-    float bitrate = 0.008f * totalbytes * (param->fpsNum / param->fpsDenom) / ((float)frameNum);
-    if (framesToBeEncoded)
-    {
-        int eta = (int)(elapsed * (framesToBeEncoded - frameNum) / ((int64_t)frameNum * 1000000));
-        sprintf(buf, "x265 [%.1f%%] %d/%d frames, %.2f fps, %.2f kb/s, eta %d:%02d:%02d",
-            100. * frameNum / (param->chunkEnd ? param->chunkEnd : param->totalFrames), frameNum, (param->chunkEnd ? param->chunkEnd : param->totalFrames), fps, bitrate,
-                eta / 3600, (eta / 60) % 60, eta % 60);
-    }
-    else
-        sprintf(buf, "x265 %d frames: %.2f fps, %.2f kb/s", frameNum, fps, bitrate);
-
-    fprintf(stderr, "%s  \r", buf + 5);
-    SetConsoleTitle(buf);
-    fflush(stderr); // needed in windows
-    prevUpdateTime = time;
-}
+#ifdef _WIN32
+#define strdup _strdup
+#endif
 
-bool CLIOptions::parseZoneParam(int argc, char **argv, x265_param* globalParam, int zonefileCount)
+#ifdef _WIN32
+/* Copy of x264 code, which allows for Unicode characters in the command line.
+ * Retrieve command line arguments as UTF-8. */
+static int get_argv_utf8(int *argc_ptr, char ***argv_ptr)
 {
-    bool bError = false;
-    int bShowHelp = false;
-    int outputBitDepth = 0;
-    const char *profile = NULL;
-
-    /* Presets are applied before all other options. */
-    for (optind = 0;;)
-    {
-        int c = getopt_long(argc, argv, short_options, long_options, NULL);
-        if (c == -1)
-            break;
-        else if (c == 'D')
-            outputBitDepth = atoi(optarg);
-        else if (c == 'P')
-            profile = optarg;
-        else if (c == '?')
-            bShowHelp = true;
-    }
-
-    if (!outputBitDepth && profile)
-    {
-        /* try to derive the output bit depth from the requested profile */
-        if (strstr(profile, "10"))
-            outputBitDepth = 10;
-        else if (strstr(profile, "12"))
-            outputBitDepth = 12;
-        else
-            outputBitDepth = 8;
-    }
-
-    api = x265_api_get(outputBitDepth);
-    if (!api)
+    int ret = 0;
+    wchar_t **argv_utf16 = CommandLineToArgvW(GetCommandLineW(), argc_ptr);
+    if (argv_utf16)
     {
-        x265_log(NULL, X265_LOG_WARNING, "falling back to default bit-depth\n");
-        api = x265_api_get(0);
-    }
+        int argc = *argc_ptr;
+        int offset = (argc + 1) * sizeof(char*);
+        int size = offset;
 
-    if (bShowHelp)
-    {
-        printVersion(globalParam, api);
-        showHelp(globalParam);
-    }
+        for (int i = 0; i < argc; i++)
​

x265_3.3.tar.gz/source/x265.h -> x265_3.4.tar.gz/source/x265.h Changed

@@ -134,6 +134,7 @@
     int     ctuDistortionRefine;
     int     rightOffset;
     int     bottomOffset;
+    int     frameDuplication;
 }x265_analysis_validate;
 
 /* Stores intra analysis data for a single frame. This struct needs better packing */
@@ -304,6 +305,7 @@
     double           totalFrameTime;
     double           vmafFrameScore;
     double           bufferFillFinal;
+    double           unclippedBufferFillFinal;
 } x265_frame_stats;
 
 typedef struct x265_ctu_info_t
@@ -1255,9 +1257,9 @@
      * skip blocks. Default is disabled */
     int       bEnableEarlySkip;
 
-    /* Enable early CU size decisions to avoid recursing to higher depths. 
+    /* Enable early CU size decisions to avoid recursing to higher depths.
      * Default is enabled */
-    int bEnableRecursionSkip;
+    int       recursionSkipMode;
 
     /* Use a faster search method to find the best intra mode. Default is 0 */
     int       bEnableFastIntra;
@@ -1857,7 +1859,7 @@
     double    edgeTransitionThreshold;
 
     /* Enables histogram based scenecut detection algorithm to detect scenecuts. Default disabled */
-    int      bHistBasedSceneCut;
+    int       bHistBasedSceneCut;
 
     /* Enable HME search ranges for L0, L1 and L2 respectively. */
     int       hmeRange[3];
@@ -1874,7 +1876,7 @@
     * analysis information stored in analysis-save. Higher the refine level higher
     * the information stored. Default is 5 */
     int       analysisSaveReuseLevel;
-    
+
     /* A value between 1 and 10 (both inclusive) determines the level of
     * analysis information reused in analysis-load. Higher the refine level higher
     * the information reused. Default is 5 */
@@ -1901,6 +1903,12 @@
     * info is available from the corresponding analysis-save. */
 
     int      confWinBottomOffset;
+
+    /* Edge variance threshold for quad tree establishment. */
+    float    edgeVarThreshold;
+
+    /* Maxrate that could be signaled to the decoder. Default 0. API only. */
+    int      decoderVbvMaxRate;
 } x265_param;
 
 /* x265_param_alloc:

 
@@ -134,6 +134,7 @@
     int     ctuDistortionRefine;
     int     rightOffset;
     int     bottomOffset;
+    int     frameDuplication;
 }x265_analysis_validate;
 
 /* Stores intra analysis data for a single frame. This struct needs better packing */
@@ -304,6 +305,7 @@
     double           totalFrameTime;
     double           vmafFrameScore;
     double           bufferFillFinal;
+    double           unclippedBufferFillFinal;
 } x265_frame_stats;
 
 typedef struct x265_ctu_info_t
@@ -1255,9 +1257,9 @@
      * skip blocks. Default is disabled */
     int       bEnableEarlySkip;
 
-    /* Enable early CU size decisions to avoid recursing to higher depths. 
+    /* Enable early CU size decisions to avoid recursing to higher depths.
      * Default is enabled */
-    int bEnableRecursionSkip;
+    int       recursionSkipMode;
 
     /* Use a faster search method to find the best intra mode. Default is 0 */
     int       bEnableFastIntra;
@@ -1857,7 +1859,7 @@
     double    edgeTransitionThreshold;
 
     /* Enables histogram based scenecut detection algorithm to detect scenecuts. Default disabled */
-    int      bHistBasedSceneCut;
+    int       bHistBasedSceneCut;
 
     /* Enable HME search ranges for L0, L1 and L2 respectively. */
     int       hmeRange[3];
@@ -1874,7 +1876,7 @@
     * analysis information stored in analysis-save. Higher the refine level higher
     * the information stored. Default is 5 */
     int       analysisSaveReuseLevel;
-    
+
     /* A value between 1 and 10 (both inclusive) determines the level of
     * analysis information reused in analysis-load. Higher the refine level higher
     * the information reused. Default is 5 */
@@ -1901,6 +1903,12 @@
     * info is available from the corresponding analysis-save. */
 
     int      confWinBottomOffset;
+
+    /* Edge variance threshold for quad tree establishment. */
+    float    edgeVarThreshold;
+
+    /* Maxrate that could be signaled to the decoder. Default 0. API only. */
+    int      decoderVbvMaxRate;
 } x265_param;
 
 /* x265_param_alloc:
​

x265_3.4.tar.gz/source/x265cli.cpp Added

@@ -0,0 +1,1062 @@
+/*****************************************************************************
+ * Copyright (C) 2013-2020 MulticoreWare, Inc
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+#if _MSC_VER
+#pragma warning(disable: 4127) // conditional expression is constant, yes I know
+#endif
+
+#include "x265cli.h"
+#include "svt.h"
+
+#define START_CODE 0x00000001
+#define START_CODE_BYTES 4
+
+#ifdef __cplusplus
+namespace X265_NS {
+#endif
+
+    static void printVersion(x265_param *param, const x265_api* api)
+    {
+        x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", api->version_str);
+        x265_log(param, X265_LOG_INFO, "build info %s\n", api->build_info_str);
+    }
+
+    static void showHelp(x265_param *param)
+    {
+        int level = param->logLevel;
+
+#define OPT(value) (value ? "enabled" : "disabled")
+#define H0 printf
+#define H1 if (level >= X265_LOG_DEBUG) printf
+
+        H0("\nSyntax: x265 [options] infile [-o] outfile\n");
+        H0("    infile can be YUV or Y4M\n");
+        H0("    outfile is raw HEVC bitstream\n");
+        H0("\nExecutable Options:\n");
+        H0("-h/--help                        Show this help text and exit\n");
+        H0("   --fullhelp                    Show all options and exit\n");
+        H0("-V/--version                     Show version info and exit\n");
+        H0("\nOutput Options:\n");
+        H0("-o/--output <filename>           Bitstream output file name\n");
+        H0("-D/--output-depth 8|10|12        Output bit depth (also internal bit depth). Default %d\n", param->internalBitDepth);
+        H0("   --log-level <string>          Logging level: none error warning info debug full. Default %s\n", X265_NS::logLevelNames[param->logLevel + 1]);
+        H0("   --no-progress                 Disable CLI progress reports\n");
+        H0("   --csv <filename>              Comma separated log file, if csv-log-level > 0 frame level statistics, else one line per run\n");
+        H0("   --csv-log-level <integer>     Level of csv logging, if csv-log-level > 0 frame level statistics, else one line per run: 0-2\n");
+        H0("\nInput Options:\n");
+        H0("   --input <filename>            Raw YUV or Y4M input file name. `-` for stdin\n");
+        H1("   --y4m                         Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
+        H0("   --fps <float|rational>        Source frame rate (float or num/denom), auto-detected if Y4M\n");
+        H0("   --input-res WxH               Source picture size [w x h], auto-detected if Y4M\n");
+        H1("   --input-depth <integer>       Bit-depth of input file. Default 8\n");
+        H1("   --input-csp <string>          Chroma subsampling, auto-detected if Y4M\n");
+        H1("                                 0 - i400 (4:0:0 monochrome)\n");
+        H1("                                 1 - i420 (4:2:0 default)\n");
+        H1("                                 2 - i422 (4:2:2)\n");
+        H1("                                 3 - i444 (4:4:4)\n");
+#if ENABLE_HDR10_PLUS
+        H0("   --dhdr10-info <filename>      JSON file containing the Creative Intent Metadata to be encoded as Dynamic Tone Mapping\n");
+        H0("   --[no-]dhdr10-opt             Insert tone mapping SEI only for IDR frames and when the tone mapping information changes. Default disabled\n");
+#endif
+        H0("   --dolby-vision-profile <float|integer> Specifies Dolby Vision profile ID. Currently only profile 5, profile 8.1 and profile 8.2 enabled. Specified as '5' or '50'. Default 0 (disabled).\n");
+        H0("   --dolby-vision-rpu <filename> File containing Dolby Vision RPU metadata.\n"
+            "                                 If given, x265's Dolby Vision metadata parser will fill the RPU field of input pictures with the metadata read from the file. Default NULL(disabled).\n");
+        H0("   --nalu-file <filename>        Text file containing SEI messages in the following format : <POC><space><PREFIX><space><NAL UNIT TYPE>/<SEI TYPE><space><SEI Payload>\n");
+        H0("-f/--frames <integer>            Maximum number of frames to encode. Default all\n");
+        H0("   --seek <integer>              First frame to encode\n");
+        H1("   --[no-]interlace <bff|tff>    Indicate input pictures are interlace fields in temporal order. Default progressive\n");
+        H0("   --[no-]field                  Enable or disable field coding. Default %s\n", OPT(param->bField));
+        H1("   --dither                      Enable dither if downscaling to 8 bit pixels. Default disabled\n");
+        H0("   --[no-]copy-pic               Copy buffers of input picture in frame. Default %s\n", OPT(param->bCopyPicToFrame));
+        H0("\nQuality reporting metrics:\n");
+        H0("   --[no-]ssim                   Enable reporting SSIM metric scores. Default %s\n", OPT(param->bEnableSsim));
+        H0("   --[no-]psnr                   Enable reporting PSNR metric scores. Default %s\n", OPT(param->bEnablePsnr));
+        H0("\nProfile, Level, Tier:\n");
+        H0("-P/--profile <string>            Enforce an encode profile: main, main10, mainstillpicture\n");
+        H0("   --level-idc <integer|float>   Force a minimum required decoder level (as '5.0' or '50')\n");
+        H0("   --[no-]high-tier              If a decoder level is specified, this modifier selects High tier of that level\n");
+        H0("   --uhd-bd                      Enable UHD Bluray compatibility support\n");
+        H0("   --[no-]allow-non-conformance  Allow the encoder to generate profile NONE bitstreams. Default %s\n", OPT(param->bAllowNonConformance));
+        H0("\nThreading, performance:\n");
+        H0("   --pools <integer,...>         Comma separated thread count per thread pool (pool per NUMA node)\n");
+        H0("                                 '-' implies no threads on node, '+' implies one thread per core on node\n");
+        H0("-F/--frame-threads <integer>     Number of concurrently encoded frames. 0: auto-determined by core count\n");
+        H0("   --[no-]wpp                    Enable Wavefront Parallel Processing. Default %s\n", OPT(param->bEnableWavefront));
+        H0("   --[no-]slices <integer>       Enable Multiple Slices feature. Default %d\n", param->maxSlices);
+        H0("   --[no-]pmode                  Parallel mode analysis. Default %s\n", OPT(param->bDistributeModeAnalysis));
+        H0("   --[no-]pme                    Parallel motion estimation. Default %s\n", OPT(param->bDistributeMotionEstimation));
+        H0("   --[no-]asm <bool|int|string>  Override CPU detection. Default: auto\n");
+        H0("\nPresets:\n");
+        H0("-p/--preset <string>             Trade off performance for compression efficiency. Default medium\n");
+        H0("                                 ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo\n");
+        H0("-t/--tune <string>               Tune the settings for a particular type of source or situation:\n");
+        H0("                                 psnr, ssim, grain, zerolatency, fastdecode\n");
+        H0("\nQuad-Tree size and depth:\n");
+        H0("-s/--ctu <64|32|16>              Maximum CU size (WxH). Default %d\n", param->maxCUSize);
+        H0("   --min-cu-size <64|32|16|8>    Minimum CU size (WxH). Default %d\n", param->minCUSize);
+        H0("   --max-tu-size <32|16|8|4>     Maximum TU size (WxH). Default %d\n", param->maxTUSize);
+        H0("   --tu-intra-depth <integer>    Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth);
+        H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
+        H0("   --limit-tu <0..4>             Enable early exit from TU recursion for inter coded blocks. Default %d\n", param->limitTU);
+        H0("\nAnalysis:\n");
+        H0("   --rd <1..6>                   Level of RDO in mode decision 1:least....6:full RDO. Default %d\n", param->rdLevel);
+        H0("   --[no-]psy-rd <0..5.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
+        H0("   --[no-]rdoq-level <0|1|2>     Level of RDO in quantization 0:none, 1:levels, 2:levels & coding groups. Default %d\n", param->rdoqLevel);
+        H0("   --[no-]psy-rdoq <0..50.0>     Strength of psycho-visual optimization in RDO quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
+        H0("   --dynamic-rd <0..4.0>         Strength of dynamic RD, 0 to disable. Default %.2f\n", param->dynamicRd);
+        H0("   --[no-]ssim-rd                Enable ssim rate distortion optimization, 0 to disable. Default %s\n", OPT(param->bSsimRd));
+        H0("   --[no-]rd-refine              Enable QP based RD refinement for rd levels 5 and 6. Default %s\n", OPT(param->bEnableRdRefine));
+        H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
+        H0("   --rskip <mode>                Set mode for early exit from recursion. Mode 1: exit using rdcost & CU homogenity. Mode 2: exit using CU edge density.\n"
+            "                                 Mode 0: disabled. Default %d\n", param->recursionSkipMode);
+        H1("   --rskip-edge-threshold        Threshold in terms of percentage (integer of range [0,100]) for minimum edge density in CUs used to prun the recursion depth. Applicable only for rskip mode 2. Value is preset dependent. Default: %.f\n", param->edgeVarThreshold*100.0f);
+        H1("   --[no-]tskip-fast             Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
+        H1("   --[no-]splitrd-skip           Enable skipping split RD analysis when sum of split CU rdCost larger than one split CU rdCost for Intra CU. Default %s\n", OPT(param->bEnableSplitRdSkip));
+        H1("   --nr-intra <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
+        H1("   --nr-inter <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n");
+        H0("   --ctu-info <integer>          Enable receiving ctu information asynchronously and determine reaction to the CTU information (0, 1, 2, 4, 6) Default 0\n"
+            "                                    - 1: force the partitions if CTU information is present\n"
+            "                                    - 2: functionality of (1) and reduce qp if CTU information has changed\n"
+            "                                    - 4: functionality of (1) and force Inter modes when CTU Information has changed, merge/skip otherwise\n"
+            "                                    Enable this option only when planning to invoke the API function x265_encoder_ctu_info to copy ctu-info asynchronously\n");
+        H0("\nCoding tools:\n");
+        H0("-w/--[no-]weightp                Enable weighted prediction in P slices. Default %s\n", OPT(param->bEnableWeightedPred));
+        H0("   --[no-]weightb                Enable weighted prediction in B slices. Default %s\n", OPT(param->bEnableWeightedBiPred));
+        H0("   --[no-]cu-lossless            Consider lossless mode in CU RDO decisions. Default %s\n", OPT(param->bCULossless));
+        H0("   --[no-]signhide               Hide sign bit of one coeff per TU (rdo). Default %s\n", OPT(param->bEnableSignHiding));
+        H1("   --[no-]tskip                  Enable intra 4x4 transform skipping. Default %s\n", OPT(param->bEnableTransformSkip));
+        H0("\nTemporal / motion search options:\n");
+        H0("   --max-merge <1..5>            Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
+        H0("   --ref <integer>               max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
+        H0("   --limit-refs <0|1|2|3>        Limit references per depth (1) or CU (2) or both (3). Default %d\n", param->limitReferences);
+        H0("   --me <string>                 Motion search method dia hex umh star full. Default %d\n", param->searchMethod);
+        H0("-m/--subme <integer>             Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine);
+        H0("   --merange <integer>           Motion search range. Default %d\n", param->searchRange);
+        H0("   --[no-]rect                   Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
+        H0("   --[no-]amp                    Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
+        H0("   --[no-]limit-modes            Limit rectangular and asymmetric motion predictions. Default %d\n", param->limitModes);
+        H1("   --[no-]temporal-mvp           Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
+        H1("   --[no-]hme                    Enable Hierarchical Motion Estimation. Default %s\n", OPT(param->bEnableHME));
+        H1("   --hme-search <string>         Motion search-method for HME L0,L1 and L2. Default(L0,L1,L2) is %d,%d,%d\n", param->hmeSearchMethod[0], param->hmeSearchMethod[1], param->hmeSearchMethod[2]);
+        H1("   --hme-range <int>,<int>,<int> Motion search-range for HME L0,L1 and L2. Default(L0,L1,L2) is %d,%d,%d\n", param->hmeRange[0], param->hmeRange[1], param->hmeRange[2]);
+        H0("\nSpatial / intra options:\n");
+        H0("   --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default %s\n", OPT(param->bEnableStrongIntraSmoothing));
+        H0("   --[no-]constrained-intra      Constrained intra prediction (use only intra coded reference pixels) Default %s\n", OPT(param->bEnableConstrainedIntra));
+        H0("   --[no-]b-intra                Enable intra in B frames in veryslow presets. Default %s\n", OPT(param->bIntraInBFrames));
+        H0("   --[no-]fast-intra             Enable faster search method for angular intra predictions. Default %s\n", OPT(param->bEnableFastIntra));
+        H0("   --rdpenalty <0..2>            penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum. Default %d\n", param->rdPenalty);
+        H0("\nSlice decision options:\n");
+        H0("   --[no-]open-gop               Enable open-GOP, allows I slices to be non-IDR. Default %s\n", OPT(param->bOpenGOP));
+        H0("-I/--keyint <integer>            Max IDR period in frames. -1 for infinite-gop. Default %d\n", param->keyframeMax);
+        H0("-i/--min-keyint <integer>        Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
+        H0("   --gop-lookahead <integer>     Extends gop boundary if a scenecut is found within this from keyint boundary. Default 0\n");
+        H0("   --no-scenecut                 Disable adaptive I-frame decision\n");
+        H0("   --scenecut <integer>          How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold);
+        H1("   --scenecut-bias <0..100.0>    Bias for scenecut detection. Default %.2f\n", param->scenecutBias);
+        H0("   --hist-scenecut               Enables histogram based scene-cut detection using histogram based algorithm.\n");
+        H0("   --no-hist-scenecut            Disables histogram based scene-cut detection using histogram based algorithm.\n");
+        H1("   --hist-threshold <0.0..2.0>   Luma Edge histogram's Normalized SAD threshold for histogram based scenecut detection Default %.2f\n", param->edgeTransitionThreshold);
+        H0("   --[no-]fades                  Enable detection and handling of fade-in regions. Default %s\n", OPT(param->bEnableFades));
+        H1("   --[no-]scenecut-aware-qp      Enable increasing QP for frames inside the scenecut window after scenecut. Default %s\n", OPT(param->bEnableSceneCutAwareQp));
+        H1("   --scenecut-window <0..1000>   QP incremental duration(in milliseconds) when scenecut-aware-qp is enabled. Default %d\n", param->scenecutWindow);
+        H1("   --max-qp-delta <0..10>        QP offset to increment with base QP for inter-frames. Default %d\n", param->maxQpDelta);
+        H0("   --radl <integer>              Number of RADL pictures allowed in front of IDR. Default %d\n", param->radl);
+        H0("   --intra-refresh               Use Periodic Intra Refresh instead of IDR frames\n");
+        H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
+        H1("   --lookahead-slices <0..16>    Number of slices to use per lookahead cost estimate. Default %d\n", param->lookaheadSlices);
+        H0("   --lookahead-threads <integer> Number of threads to be dedicated to perform lookahead only. Default %d\n", param->lookaheadThreads);
+        H0("-b/--bframes <0..16>             Maximum number of consecutive b-frames. Default %d\n", param->bframes);
+        H1("   --bframe-bias <integer>       Bias towards B frame decisions. Default %d\n", param->bFrameBias);
+        H0("   --b-adapt <0..2>              0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default %d\n", param->bFrameAdaptive);
+        H0("   --[no-]b-pyramid              Use B-frames as references. Default %s\n", OPT(param->bBPyramid));
+        H1("   --qpfile <string>             Force frametypes and QPs for some or all frames\n");
+        H1("                                 Format of each line: framenumber frametype QP\n");
+        H1("                                 QP is optional (none lets x265 choose). Frametypes: I,i,K,P,B,b.\n");
+        H1("                                 QPs are restricted by qpmin/qpmax.\n");
+        H1("   --force-flush <integer>       Force the encoder to flush frames. Default %d\n", param->forceFlush);
+        H1("                                 0 - flush the encoder only when all the input pictures are over.\n");
+        H1("                                 1 - flush all the frames even when the input is not over. Slicetype decision may change with this option.\n");
+        H1("                                 2 - flush the slicetype decided frames only.\n");

 
@@ -0,0 +1,1062 @@
+/*****************************************************************************
+ * Copyright (C) 2013-2020 MulticoreWare, Inc
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+#if _MSC_VER
+#pragma warning(disable: 4127) // conditional expression is constant, yes I know
+#endif
+
+#include "x265cli.h"
+#include "svt.h"
+
+#define START_CODE 0x00000001
+#define START_CODE_BYTES 4
+
+#ifdef __cplusplus
+namespace X265_NS {
+#endif
+
+    static void printVersion(x265_param *param, const x265_api* api) // Logs the encoder version and build-info strings taken from `api` at INFO level.
+    {
+        x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", api->version_str); // `api` is dereferenced without a NULL check; callers must pass a valid API table
+        x265_log(param, X265_LOG_INFO, "build info %s\n", api->build_info_str); // second line: build description string supplied by the same API table
+    }
+
+    static void showHelp(x265_param *param)
+    {
+        int level = param->logLevel;
+
+#define OPT(value) (value ? "enabled" : "disabled")
+#define H0 printf
+#define H1 if (level >= X265_LOG_DEBUG) printf
+
+        H0("\nSyntax: x265 [options] infile [-o] outfile\n");
+        H0("    infile can be YUV or Y4M\n");
+        H0("    outfile is raw HEVC bitstream\n");
+        H0("\nExecutable Options:\n");
+        H0("-h/--help                        Show this help text and exit\n");
+        H0("   --fullhelp                    Show all options and exit\n");
+        H0("-V/--version                     Show version info and exit\n");
+        H0("\nOutput Options:\n");
+        H0("-o/--output <filename>           Bitstream output file name\n");
+        H0("-D/--output-depth 8|10|12        Output bit depth (also internal bit depth). Default %d\n", param->internalBitDepth);
+        H0("   --log-level <string>          Logging level: none error warning info debug full. Default %s\n", X265_NS::logLevelNames[param->logLevel + 1]);
+        H0("   --no-progress                 Disable CLI progress reports\n");
+        H0("   --csv <filename>              Comma separated log file, if csv-log-level > 0 frame level statistics, else one line per run\n");
+        H0("   --csv-log-level <integer>     Level of csv logging, if csv-log-level > 0 frame level statistics, else one line per run: 0-2\n");
+        H0("\nInput Options:\n");
+        H0("   --input <filename>            Raw YUV or Y4M input file name. `-` for stdin\n");
+        H1("   --y4m                         Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
+        H0("   --fps <float|rational>        Source frame rate (float or num/denom), auto-detected if Y4M\n");
+        H0("   --input-res WxH               Source picture size [w x h], auto-detected if Y4M\n");
+        H1("   --input-depth <integer>       Bit-depth of input file. Default 8\n");
+        H1("   --input-csp <string>          Chroma subsampling, auto-detected if Y4M\n");
+        H1("                                 0 - i400 (4:0:0 monochrome)\n");
+        H1("                                 1 - i420 (4:2:0 default)\n");
+        H1("                                 2 - i422 (4:2:2)\n");
+        H1("                                 3 - i444 (4:4:4)\n");
+#if ENABLE_HDR10_PLUS
+        H0("   --dhdr10-info <filename>      JSON file containing the Creative Intent Metadata to be encoded as Dynamic Tone Mapping\n");
+        H0("   --[no-]dhdr10-opt             Insert tone mapping SEI only for IDR frames and when the tone mapping information changes. Default disabled\n");
+#endif
+        H0("   --dolby-vision-profile <float|integer> Specifies Dolby Vision profile ID. Currently only profile 5, profile 8.1 and profile 8.2 enabled. Specified as '5' or '50'. Default 0 (disabled).\n");
+        H0("   --dolby-vision-rpu <filename> File containing Dolby Vision RPU metadata.\n"
+            "                                 If given, x265's Dolby Vision metadata parser will fill the RPU field of input pictures with the metadata read from the file. Default NULL(disabled).\n");
+        H0("   --nalu-file <filename>        Text file containing SEI messages in the following format : <POC><space><PREFIX><space><NAL UNIT TYPE>/<SEI TYPE><space><SEI Payload>\n");
+        H0("-f/--frames <integer>            Maximum number of frames to encode. Default all\n");
+        H0("   --seek <integer>              First frame to encode\n");
+        H1("   --[no-]interlace <bff|tff>    Indicate input pictures are interlace fields in temporal order. Default progressive\n");
+        H0("   --[no-]field                  Enable or disable field coding. Default %s\n", OPT(param->bField));
+        H1("   --dither                      Enable dither if downscaling to 8 bit pixels. Default disabled\n");
+        H0("   --[no-]copy-pic               Copy buffers of input picture in frame. Default %s\n", OPT(param->bCopyPicToFrame));
+        H0("\nQuality reporting metrics:\n");
+        H0("   --[no-]ssim                   Enable reporting SSIM metric scores. Default %s\n", OPT(param->bEnableSsim));
+        H0("   --[no-]psnr                   Enable reporting PSNR metric scores. Default %s\n", OPT(param->bEnablePsnr));
+        H0("\nProfile, Level, Tier:\n");
+        H0("-P/--profile <string>            Enforce an encode profile: main, main10, mainstillpicture\n");
+        H0("   --level-idc <integer|float>   Force a minimum required decoder level (as '5.0' or '50')\n");
+        H0("   --[no-]high-tier              If a decoder level is specified, this modifier selects High tier of that level\n");
+        H0("   --uhd-bd                      Enable UHD Bluray compatibility support\n");
+        H0("   --[no-]allow-non-conformance  Allow the encoder to generate profile NONE bitstreams. Default %s\n", OPT(param->bAllowNonConformance));
+        H0("\nThreading, performance:\n");
+        H0("   --pools <integer,...>         Comma separated thread count per thread pool (pool per NUMA node)\n");
+        H0("                                 '-' implies no threads on node, '+' implies one thread per core on node\n");
+        H0("-F/--frame-threads <integer>     Number of concurrently encoded frames. 0: auto-determined by core count\n");
+        H0("   --[no-]wpp                    Enable Wavefront Parallel Processing. Default %s\n", OPT(param->bEnableWavefront));
+        H0("   --[no-]slices <integer>       Enable Multiple Slices feature. Default %d\n", param->maxSlices);
+        H0("   --[no-]pmode                  Parallel mode analysis. Default %s\n", OPT(param->bDistributeModeAnalysis));
+        H0("   --[no-]pme                    Parallel motion estimation. Default %s\n", OPT(param->bDistributeMotionEstimation));
+        H0("   --[no-]asm <bool|int|string>  Override CPU detection. Default: auto\n");
+        H0("\nPresets:\n");
+        H0("-p/--preset <string>             Trade off performance for compression efficiency. Default medium\n");
+        H0("                                 ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo\n");
+        H0("-t/--tune <string>               Tune the settings for a particular type of source or situation:\n");
+        H0("                                 psnr, ssim, grain, zerolatency, fastdecode\n");
+        H0("\nQuad-Tree size and depth:\n");
+        H0("-s/--ctu <64|32|16>              Maximum CU size (WxH). Default %d\n", param->maxCUSize);
+        H0("   --min-cu-size <64|32|16|8>    Minimum CU size (WxH). Default %d\n", param->minCUSize);
+        H0("   --max-tu-size <32|16|8|4>     Maximum TU size (WxH). Default %d\n", param->maxTUSize);
+        H0("   --tu-intra-depth <integer>    Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth);
+        H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
+        H0("   --limit-tu <0..4>             Enable early exit from TU recursion for inter coded blocks. Default %d\n", param->limitTU);
+        H0("\nAnalysis:\n");
+        H0("   --rd <1..6>                   Level of RDO in mode decision 1:least....6:full RDO. Default %d\n", param->rdLevel);
+        H0("   --[no-]psy-rd <0..5.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
+        H0("   --[no-]rdoq-level <0|1|2>     Level of RDO in quantization 0:none, 1:levels, 2:levels & coding groups. Default %d\n", param->rdoqLevel);
+        H0("   --[no-]psy-rdoq <0..50.0>     Strength of psycho-visual optimization in RDO quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
+        H0("   --dynamic-rd <0..4.0>         Strength of dynamic RD, 0 to disable. Default %.2f\n", param->dynamicRd);
+        H0("   --[no-]ssim-rd                Enable ssim rate distortion optimization, 0 to disable. Default %s\n", OPT(param->bSsimRd));
+        H0("   --[no-]rd-refine              Enable QP based RD refinement for rd levels 5 and 6. Default %s\n", OPT(param->bEnableRdRefine));
+        H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
+        H0("   --rskip <mode>                Set mode for early exit from recursion. Mode 1: exit using rdcost & CU homogenity. Mode 2: exit using CU edge density.\n"
+            "                                 Mode 0: disabled. Default %d\n", param->recursionSkipMode);
+        H1("   --rskip-edge-threshold        Threshold in terms of percentage (integer of range [0,100]) for minimum edge density in CUs used to prun the recursion depth. Applicable only for rskip mode 2. Value is preset dependent. Default: %.f\n", param->edgeVarThreshold*100.0f);
+        H1("   --[no-]tskip-fast             Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
+        H1("   --[no-]splitrd-skip           Enable skipping split RD analysis when sum of split CU rdCost larger than one split CU rdCost for Intra CU. Default %s\n", OPT(param->bEnableSplitRdSkip));
+        H1("   --nr-intra <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
+        H1("   --nr-inter <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n");
+        H0("   --ctu-info <integer>          Enable receiving ctu information asynchronously and determine reaction to the CTU information (0, 1, 2, 4, 6) Default 0\n"
+            "                                    - 1: force the partitions if CTU information is present\n"
+            "                                    - 2: functionality of (1) and reduce qp if CTU information has changed\n"
+            "                                    - 4: functionality of (1) and force Inter modes when CTU Information has changed, merge/skip otherwise\n"
+            "                                    Enable this option only when planning to invoke the API function x265_encoder_ctu_info to copy ctu-info asynchronously\n");
+        H0("\nCoding tools:\n");
+        H0("-w/--[no-]weightp                Enable weighted prediction in P slices. Default %s\n", OPT(param->bEnableWeightedPred));
+        H0("   --[no-]weightb                Enable weighted prediction in B slices. Default %s\n", OPT(param->bEnableWeightedBiPred));
+        H0("   --[no-]cu-lossless            Consider lossless mode in CU RDO decisions. Default %s\n", OPT(param->bCULossless));
+        H0("   --[no-]signhide               Hide sign bit of one coeff per TU (rdo). Default %s\n", OPT(param->bEnableSignHiding));
+        H1("   --[no-]tskip                  Enable intra 4x4 transform skipping. Default %s\n", OPT(param->bEnableTransformSkip));
+        H0("\nTemporal / motion search options:\n");
+        H0("   --max-merge <1..5>            Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
+        H0("   --ref <integer>               max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
+        H0("   --limit-refs <0|1|2|3>        Limit references per depth (1) or CU (2) or both (3). Default %d\n", param->limitReferences);
+        H0("   --me <string>                 Motion search method dia hex umh star full. Default %d\n", param->searchMethod);
+        H0("-m/--subme <integer>             Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine);
+        H0("   --merange <integer>           Motion search range. Default %d\n", param->searchRange);
+        H0("   --[no-]rect                   Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
+        H0("   --[no-]amp                    Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
+        H0("   --[no-]limit-modes            Limit rectangular and asymmetric motion predictions. Default %d\n", param->limitModes);
+        H1("   --[no-]temporal-mvp           Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
+        H1("   --[no-]hme                    Enable Hierarchical Motion Estimation. Default %s\n", OPT(param->bEnableHME));
+        H1("   --hme-search <string>         Motion search-method for HME L0,L1 and L2. Default(L0,L1,L2) is %d,%d,%d\n", param->hmeSearchMethod[0], param->hmeSearchMethod[1], param->hmeSearchMethod[2]);
+        H1("   --hme-range <int>,<int>,<int> Motion search-range for HME L0,L1 and L2. Default(L0,L1,L2) is %d,%d,%d\n", param->hmeRange[0], param->hmeRange[1], param->hmeRange[2]);
+        H0("\nSpatial / intra options:\n");
+        H0("   --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default %s\n", OPT(param->bEnableStrongIntraSmoothing));
+        H0("   --[no-]constrained-intra      Constrained intra prediction (use only intra coded reference pixels) Default %s\n", OPT(param->bEnableConstrainedIntra));
+        H0("   --[no-]b-intra                Enable intra in B frames in veryslow presets. Default %s\n", OPT(param->bIntraInBFrames));
+        H0("   --[no-]fast-intra             Enable faster search method for angular intra predictions. Default %s\n", OPT(param->bEnableFastIntra));
+        H0("   --rdpenalty <0..2>            penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum. Default %d\n", param->rdPenalty);
+        H0("\nSlice decision options:\n");
+        H0("   --[no-]open-gop               Enable open-GOP, allows I slices to be non-IDR. Default %s\n", OPT(param->bOpenGOP));
+        H0("-I/--keyint <integer>            Max IDR period in frames. -1 for infinite-gop. Default %d\n", param->keyframeMax);
+        H0("-i/--min-keyint <integer>        Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
+        H0("   --gop-lookahead <integer>     Extends gop boundary if a scenecut is found within this from keyint boundary. Default 0\n");
+        H0("   --no-scenecut                 Disable adaptive I-frame decision\n");
+        H0("   --scenecut <integer>          How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold);
+        H1("   --scenecut-bias <0..100.0>    Bias for scenecut detection. Default %.2f\n", param->scenecutBias);
+        H0("   --hist-scenecut               Enables histogram based scene-cut detection using histogram based algorithm.\n");
+        H0("   --no-hist-scenecut            Disables histogram based scene-cut detection using histogram based algorithm.\n");
+        H1("   --hist-threshold <0.0..2.0>   Luma Edge histogram's Normalized SAD threshold for histogram based scenecut detection Default %.2f\n", param->edgeTransitionThreshold);
+        H0("   --[no-]fades                  Enable detection and handling of fade-in regions. Default %s\n", OPT(param->bEnableFades));
+        H1("   --[no-]scenecut-aware-qp      Enable increasing QP for frames inside the scenecut window after scenecut. Default %s\n", OPT(param->bEnableSceneCutAwareQp));
+        H1("   --scenecut-window <0..1000>   QP incremental duration(in milliseconds) when scenecut-aware-qp is enabled. Default %d\n", param->scenecutWindow);
+        H1("   --max-qp-delta <0..10>        QP offset to increment with base QP for inter-frames. Default %d\n", param->maxQpDelta);
+        H0("   --radl <integer>              Number of RADL pictures allowed in front of IDR. Default %d\n", param->radl);
+        H0("   --intra-refresh               Use Periodic Intra Refresh instead of IDR frames\n");
+        H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
+        H1("   --lookahead-slices <0..16>    Number of slices to use per lookahead cost estimate. Default %d\n", param->lookaheadSlices);
+        H0("   --lookahead-threads <integer> Number of threads to be dedicated to perform lookahead only. Default %d\n", param->lookaheadThreads);
+        H0("-b/--bframes <0..16>             Maximum number of consecutive b-frames. Default %d\n", param->bframes);
+        H1("   --bframe-bias <integer>       Bias towards B frame decisions. Default %d\n", param->bFrameBias);
+        H0("   --b-adapt <0..2>              0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default %d\n", param->bFrameAdaptive);
+        H0("   --[no-]b-pyramid              Use B-frames as references. Default %s\n", OPT(param->bBPyramid));
+        H1("   --qpfile <string>             Force frametypes and QPs for some or all frames\n");
+        H1("                                 Format of each line: framenumber frametype QP\n");
+        H1("                                 QP is optional (none lets x265 choose). Frametypes: I,i,K,P,B,b.\n");
+        H1("                                 QPs are restricted by qpmin/qpmax.\n");
+        H1("   --force-flush <integer>       Force the encoder to flush frames. Default %d\n", param->forceFlush);
+        H1("                                 0 - flush the encoder only when all the input pictures are over.\n");
+        H1("                                 1 - flush all the frames even when the input is not over. Slicetype decision may change with this option.\n");
+        H1("                                 2 - flush the slicetype decided frames only.\n");
​

x265_3.3.tar.gz/source/x265cli.h -> x265_3.4.tar.gz/source/x265cli.h Changed

@@ -27,9 +27,23 @@
 
 #include "common.h"
 #include "param.h"
+#include "input/input.h"
+#include "output/output.h"
+#include "output/reconplay.h"
 
 #include <getopt.h>
 
+#define CONSOLE_TITLE_SIZE 200 /* element count of the orgConsoleTitle buffer declared below */
+#ifdef _WIN32 /* Windows build: pull in the real console API from <windows.h> */
+#include <windows.h>
+#define SetThreadExecutionState(es) /* expands to nothing, so calls written with this name are no-ops even on Windows -- NOTE(review): confirm this is intentional */
+static char orgConsoleTitle[CONSOLE_TITLE_SIZE] = ""; /* presumably stores the console title captured at startup; its use is not visible in this header -- TODO confirm */
+#else /* non-Windows build: the three names below are defined as empty function-like macros */
+#define GetConsoleTitle(t, n) /* expands to nothing */
+#define SetConsoleTitle(t) /* expands to nothing */
+#define SetThreadExecutionState(es) /* expands to nothing */
+#endif
+
 #ifdef __cplusplus
 namespace X265_NS {
 #endif
@@ -105,8 +119,8 @@
     { "amp",                  no_argument, NULL, 0 },
     { "no-early-skip",        no_argument, NULL, 0 },
     { "early-skip",           no_argument, NULL, 0 },
-    { "no-rskip",             no_argument, NULL, 0 },
-    { "rskip",                no_argument, NULL, 0 },
+    { "rskip",                required_argument, NULL, 0 },
+    { "rskip-edge-threshold", required_argument, NULL, 0 },
     { "no-fast-cbf",          no_argument, NULL, 0 },
     { "fast-cbf",             no_argument, NULL, 0 },
     { "no-tskip",             no_argument, NULL, 0 },
@@ -358,6 +372,7 @@
     { "cll", no_argument, NULL, 0 },
     { "no-cll", no_argument, NULL, 0 },
     { "hme-range", required_argument, NULL, 0 },
+    { "abr-ladder", required_argument, NULL, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
@@ -365,336 +380,82 @@
     { 0, 0, 0, 0 }
 };
 
-static void printVersion(x265_param *param, const x265_api* api)
-{
-    x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", api->version_str);
-    x265_log(param, X265_LOG_INFO, "build info %s\n", api->build_info_str);
-}
+    struct CLIOptions
+    {
+        InputFile* input;
+        ReconFile* recon;
+        OutputFile* output;
+        FILE*       qpfile;
+        FILE*       zoneFile;
+        FILE*    dolbyVisionRpu;    /* File containing Dolby Vision BL RPU metadata */
+        const char* reconPlayCmd;
+        const x265_api* api;
+        x265_param* param;
+        x265_vmaf_data* vmafData;
+        bool bProgress;
+        bool bForceY4m;
+        bool bDither;
+        uint32_t seek;              // number of frames to skip from the beginning
+        uint32_t framesToBeEncoded; // number of frames to encode
+        uint64_t totalbytes;
+        int64_t startTime;
+        int64_t prevUpdateTime;
 
-static void showHelp(x265_param *param)
-{
-    int level = param->logLevel;
+        int argCnt;
+        char** argString;
 
-#define OPT(value) (value ? "enabled" : "disabled")
-#define H0 printf
-#define H1 if (level >= X265_LOG_DEBUG) printf
+        /* ABR ladder settings */
+        bool isAbrLadderConfig;
+        bool enableScaler;
+        char*    encName;
+        char*    reuseName;
+        uint32_t encId;
+        int      refId;
+        uint32_t loadLevel;
+        uint32_t saveLevel;
+        uint32_t numRefs;
 
-    H0("\nSyntax: x265 [options] infile [-o] outfile\n");
-    H0("    infile can be YUV or Y4M\n");
-    H0("    outfile is raw HEVC bitstream\n");
-    H0("\nExecutable Options:\n");
-    H0("-h/--help                        Show this help text and exit\n");
-    H0("   --fullhelp                    Show all options and exit\n");
-    H0("-V/--version                     Show version info and exit\n");
-    H0("\nOutput Options:\n");
-    H0("-o/--output <filename>           Bitstream output file name\n");
-    H0("-D/--output-depth 8|10|12        Output bit depth (also internal bit depth). Default %d\n", param->internalBitDepth);
-    H0("   --log-level <string>          Logging level: none error warning info debug full. Default %s\n", X265_NS::logLevelNames[param->logLevel + 1]);
-    H0("   --no-progress                 Disable CLI progress reports\n");
-    H0("   --csv <filename>              Comma separated log file, if csv-log-level > 0 frame level statistics, else one line per run\n");
-    H0("   --csv-log-level <integer>     Level of csv logging, if csv-log-level > 0 frame level statistics, else one line per run: 0-2\n");
-    H0("\nInput Options:\n");
-    H0("   --input <filename>            Raw YUV or Y4M input file name. `-` for stdin\n");
-    H1("   --y4m                         Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
-    H0("   --fps <float|rational>        Source frame rate (float or num/denom), auto-detected if Y4M\n");
-    H0("   --input-res WxH               Source picture size [w x h], auto-detected if Y4M\n");
-    H1("   --input-depth <integer>       Bit-depth of input file. Default 8\n");
-    H1("   --input-csp <string>          Chroma subsampling, auto-detected if Y4M\n");
-    H1("                                 0 - i400 (4:0:0 monochrome)\n");
-    H1("                                 1 - i420 (4:2:0 default)\n");
-    H1("                                 2 - i422 (4:2:2)\n");
-    H1("                                 3 - i444 (4:4:4)\n");
-#if ENABLE_HDR10_PLUS
-    H0("   --dhdr10-info <filename>      JSON file containing the Creative Intent Metadata to be encoded as Dynamic Tone Mapping\n");
-    H0("   --[no-]dhdr10-opt             Insert tone mapping SEI only for IDR frames and when the tone mapping information changes. Default disabled\n");
-#endif
-    H0("   --dolby-vision-profile <float|integer> Specifies Dolby Vision profile ID. Currently only profile 5, profile 8.1 and profile 8.2 enabled. Specified as '5' or '50'. Default 0 (disabled).\n");
-    H0("   --dolby-vision-rpu <filename> File containing Dolby Vision RPU metadata.\n"
-       "                                 If given, x265's Dolby Vision metadata parser will fill the RPU field of input pictures with the metadata read from the file. Default NULL(disabled).\n");
-    H0("   --nalu-file <filename>        Text file containing SEI messages in the following format : <POC><space><PREFIX><space><NAL UNIT TYPE>/<SEI TYPE><space><SEI Payload>\n");
-    H0("-f/--frames <integer>            Maximum number of frames to encode. Default all\n");
-    H0("   --seek <integer>              First frame to encode\n");
-    H1("   --[no-]interlace <bff|tff>    Indicate input pictures are interlace fields in temporal order. Default progressive\n");
-    H0("   --[no-]field                  Enable or disable field coding. Default %s\n", OPT( param->bField));
-    H1("   --dither                      Enable dither if downscaling to 8 bit pixels. Default disabled\n");
-    H0("   --[no-]copy-pic               Copy buffers of input picture in frame. Default %s\n", OPT(param->bCopyPicToFrame));
-    H0("\nQuality reporting metrics:\n");
-    H0("   --[no-]ssim                   Enable reporting SSIM metric scores. Default %s\n", OPT(param->bEnableSsim));
-    H0("   --[no-]psnr                   Enable reporting PSNR metric scores. Default %s\n", OPT(param->bEnablePsnr));
-    H0("\nProfile, Level, Tier:\n");
-    H0("-P/--profile <string>            Enforce an encode profile: main, main10, mainstillpicture\n");
-    H0("   --level-idc <integer|float>   Force a minimum required decoder level (as '5.0' or '50')\n");
-    H0("   --[no-]high-tier              If a decoder level is specified, this modifier selects High tier of that level\n");
-    H0("   --uhd-bd                      Enable UHD Bluray compatibility support\n");
-    H0("   --[no-]allow-non-conformance  Allow the encoder to generate profile NONE bitstreams. Default %s\n", OPT(param->bAllowNonConformance));
-    H0("\nThreading, performance:\n");
-    H0("   --pools <integer,...>         Comma separated thread count per thread pool (pool per NUMA node)\n");
-    H0("                                 '-' implies no threads on node, '+' implies one thread per core on node\n");
-    H0("-F/--frame-threads <integer>     Number of concurrently encoded frames. 0: auto-determined by core count\n");
-    H0("   --[no-]wpp                    Enable Wavefront Parallel Processing. Default %s\n", OPT(param->bEnableWavefront));
-    H0("   --[no-]slices <integer>       Enable Multiple Slices feature. Default %d\n", param->maxSlices);
-    H0("   --[no-]pmode                  Parallel mode analysis. Default %s\n", OPT(param->bDistributeModeAnalysis));
-    H0("   --[no-]pme                    Parallel motion estimation. Default %s\n", OPT(param->bDistributeMotionEstimation));
-    H0("   --[no-]asm <bool|int|string>  Override CPU detection. Default: auto\n");
-    H0("\nPresets:\n");
-    H0("-p/--preset <string>             Trade off performance for compression efficiency. Default medium\n");
-    H0("                                 ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo\n");
-    H0("-t/--tune <string>               Tune the settings for a particular type of source or situation:\n");
-    H0("                                 psnr, ssim, grain, zerolatency, fastdecode\n");
-    H0("\nQuad-Tree size and depth:\n");
-    H0("-s/--ctu <64|32|16>              Maximum CU size (WxH). Default %d\n", param->maxCUSize);
-    H0("   --min-cu-size <64|32|16|8>    Minimum CU size (WxH). Default %d\n", param->minCUSize);
-    H0("   --max-tu-size <32|16|8|4>     Maximum TU size (WxH). Default %d\n", param->maxTUSize);
-    H0("   --tu-intra-depth <integer>    Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth);
-    H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
-    H0("   --limit-tu <0..4>             Enable early exit from TU recursion for inter coded blocks. Default %d\n", param->limitTU);
-    H0("\nAnalysis:\n");
-    H0("   --rd <1..6>                   Level of RDO in mode decision 1:least....6:full RDO. Default %d\n", param->rdLevel);
-    H0("   --[no-]psy-rd <0..5.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
-    H0("   --[no-]rdoq-level <0|1|2>     Level of RDO in quantization 0:none, 1:levels, 2:levels & coding groups. Default %d\n", param->rdoqLevel);
-    H0("   --[no-]psy-rdoq <0..50.0>     Strength of psycho-visual optimization in RDO quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
-    H0("   --dynamic-rd <0..4.0>         Strength of dynamic RD, 0 to disable. Default %.2f\n", param->dynamicRd);
-    H0("   --[no-]ssim-rd                Enable ssim rate distortion optimization, 0 to disable. Default %s\n", OPT(param->bSsimRd));
-    H0("   --[no-]rd-refine              Enable QP based RD refinement for rd levels 5 and 6. Default %s\n", OPT(param->bEnableRdRefine));
-    H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
-    H0("   --[no-]rskip                  Enable early exit from recursion. Default %s\n", OPT(param->bEnableRecursionSkip));
-    H1("   --[no-]tskip-fast             Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
-    H1("   --[no-]splitrd-skip           Enable skipping split RD analysis when sum of split CU rdCost larger than one split CU rdCost for Intra CU. Default %s\n", OPT(param->bEnableSplitRdSkip));
-    H1("   --nr-intra <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
-    H1("   --nr-inter <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n");
-    H0("   --ctu-info <integer>          Enable receiving ctu information asynchronously and determine reaction to the CTU information (0, 1, 2, 4, 6) Default 0\n"
-       "                                    - 1: force the partitions if CTU information is present\n"
-       "                                    - 2: functionality of (1) and reduce qp if CTU information has changed\n"
-       "                                    - 4: functionality of (1) and force Inter modes when CTU Information has changed, merge/skip otherwise\n"
-       "                                    Enable this option only when planning to invoke the API function x265_encoder_ctu_info to copy ctu-info asynchronously\n");
-    H0("\nCoding tools:\n");
-    H0("-w/--[no-]weightp                Enable weighted prediction in P slices. Default %s\n", OPT(param->bEnableWeightedPred));
-    H0("   --[no-]weightb                Enable weighted prediction in B slices. Default %s\n", OPT(param->bEnableWeightedBiPred));
-    H0("   --[no-]cu-lossless            Consider lossless mode in CU RDO decisions. Default %s\n", OPT(param->bCULossless));
-    H0("   --[no-]signhide               Hide sign bit of one coeff per TU (rdo). Default %s\n", OPT(param->bEnableSignHiding));
-    H1("   --[no-]tskip                  Enable intra 4x4 transform skipping. Default %s\n", OPT(param->bEnableTransformSkip));
-    H0("\nTemporal / motion search options:\n");
-    H0("   --max-merge <1..5>            Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
-    H0("   --ref <integer>               max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
-    H0("   --limit-refs <0|1|2|3>        Limit references per depth (1) or CU (2) or both (3). Default %d\n", param->limitReferences);
-    H0("   --me <string>                 Motion search method dia hex umh star full. Default %d\n", param->searchMethod);
-    H0("-m/--subme <integer>             Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine);
-    H0("   --merange <integer>           Motion search range. Default %d\n", param->searchRange);
-    H0("   --[no-]rect                   Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
-    H0("   --[no-]amp                    Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
-    H0("   --[no-]limit-modes            Limit rectangular and asymmetric motion predictions. Default %d\n", param->limitModes);
-    H1("   --[no-]temporal-mvp           Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
-    H1("   --[no-]hme                    Enable Hierarchical Motion Estimation. Default %s\n", OPT(param->bEnableHME));
-    H1("   --hme-search <string>         Motion search-method for HME L0,L1 and L2. Default(L0,L1,L2) is %d,%d,%d\n", param->hmeSearchMethod[0], param->hmeSearchMethod[1], param->hmeSearchMethod[2]);

 
@@ -27,9 +27,23 @@
 
 #include "common.h"
 #include "param.h"
+#include "input/input.h"
+#include "output/output.h"
+#include "output/reconplay.h"
 
 #include <getopt.h>
 
+#define CONSOLE_TITLE_SIZE 200
+#ifdef _WIN32
+#include <windows.h>
+#define SetThreadExecutionState(es)
+static char orgConsoleTitle[CONSOLE_TITLE_SIZE] = "";
+#else
+#define GetConsoleTitle(t, n)
+#define SetConsoleTitle(t)
+#define SetThreadExecutionState(es)
+#endif
+
 #ifdef __cplusplus
 namespace X265_NS {
 #endif
@@ -105,8 +119,8 @@
     { "amp",                  no_argument, NULL, 0 },
     { "no-early-skip",        no_argument, NULL, 0 },
     { "early-skip",           no_argument, NULL, 0 },
-    { "no-rskip",             no_argument, NULL, 0 },
-    { "rskip",                no_argument, NULL, 0 },
+    { "rskip",                required_argument, NULL, 0 },
+    { "rskip-edge-threshold", required_argument, NULL, 0 },
     { "no-fast-cbf",          no_argument, NULL, 0 },
     { "fast-cbf",             no_argument, NULL, 0 },
     { "no-tskip",             no_argument, NULL, 0 },
@@ -358,6 +372,7 @@
     { "cll", no_argument, NULL, 0 },
     { "no-cll", no_argument, NULL, 0 },
     { "hme-range", required_argument, NULL, 0 },
+    { "abr-ladder", required_argument, NULL, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
@@ -365,336 +380,82 @@
     { 0, 0, 0, 0 }
 };
 
-static void printVersion(x265_param *param, const x265_api* api)
-{
-    x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", api->version_str);
-    x265_log(param, X265_LOG_INFO, "build info %s\n", api->build_info_str);
-}
+    struct CLIOptions
+    {
+        InputFile* input;
+        ReconFile* recon;
+        OutputFile* output;
+        FILE*       qpfile;
+        FILE*       zoneFile;
+        FILE*    dolbyVisionRpu;    /* File containing Dolby Vision BL RPU metadata */
+        const char* reconPlayCmd;
+        const x265_api* api;
+        x265_param* param;
+        x265_vmaf_data* vmafData;
+        bool bProgress;
+        bool bForceY4m;
+        bool bDither;
+        uint32_t seek;              // number of frames to skip from the beginning
+        uint32_t framesToBeEncoded; // number of frames to encode
+        uint64_t totalbytes;
+        int64_t startTime;
+        int64_t prevUpdateTime;
 
-static void showHelp(x265_param *param)
-{
-    int level = param->logLevel;
+        int argCnt;
+        char** argString;
 
-#define OPT(value) (value ? "enabled" : "disabled")
-#define H0 printf
-#define H1 if (level >= X265_LOG_DEBUG) printf
+        /* ABR ladder settings */
+        bool isAbrLadderConfig;
+        bool enableScaler;
+        char*    encName;
+        char*    reuseName;
+        uint32_t encId;
+        int      refId;
+        uint32_t loadLevel;
+        uint32_t saveLevel;
+        uint32_t numRefs;
 
-    H0("\nSyntax: x265 [options] infile [-o] outfile\n");
-    H0("    infile can be YUV or Y4M\n");
-    H0("    outfile is raw HEVC bitstream\n");
-    H0("\nExecutable Options:\n");
-    H0("-h/--help                        Show this help text and exit\n");
-    H0("   --fullhelp                    Show all options and exit\n");
-    H0("-V/--version                     Show version info and exit\n");
-    H0("\nOutput Options:\n");
-    H0("-o/--output <filename>           Bitstream output file name\n");
-    H0("-D/--output-depth 8|10|12        Output bit depth (also internal bit depth). Default %d\n", param->internalBitDepth);
-    H0("   --log-level <string>          Logging level: none error warning info debug full. Default %s\n", X265_NS::logLevelNames[param->logLevel + 1]);
-    H0("   --no-progress                 Disable CLI progress reports\n");
-    H0("   --csv <filename>              Comma separated log file, if csv-log-level > 0 frame level statistics, else one line per run\n");
-    H0("   --csv-log-level <integer>     Level of csv logging, if csv-log-level > 0 frame level statistics, else one line per run: 0-2\n");
-    H0("\nInput Options:\n");
-    H0("   --input <filename>            Raw YUV or Y4M input file name. `-` for stdin\n");
-    H1("   --y4m                         Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
-    H0("   --fps <float|rational>        Source frame rate (float or num/denom), auto-detected if Y4M\n");
-    H0("   --input-res WxH               Source picture size [w x h], auto-detected if Y4M\n");
-    H1("   --input-depth <integer>       Bit-depth of input file. Default 8\n");
-    H1("   --input-csp <string>          Chroma subsampling, auto-detected if Y4M\n");
-    H1("                                 0 - i400 (4:0:0 monochrome)\n");
-    H1("                                 1 - i420 (4:2:0 default)\n");
-    H1("                                 2 - i422 (4:2:2)\n");
-    H1("                                 3 - i444 (4:4:4)\n");
-#if ENABLE_HDR10_PLUS
-    H0("   --dhdr10-info <filename>      JSON file containing the Creative Intent Metadata to be encoded as Dynamic Tone Mapping\n");
-    H0("   --[no-]dhdr10-opt             Insert tone mapping SEI only for IDR frames and when the tone mapping information changes. Default disabled\n");
-#endif
-    H0("   --dolby-vision-profile <float|integer> Specifies Dolby Vision profile ID. Currently only profile 5, profile 8.1 and profile 8.2 enabled. Specified as '5' or '50'. Default 0 (disabled).\n");
-    H0("   --dolby-vision-rpu <filename> File containing Dolby Vision RPU metadata.\n"
-       "                                 If given, x265's Dolby Vision metadata parser will fill the RPU field of input pictures with the metadata read from the file. Default NULL(disabled).\n");
-    H0("   --nalu-file <filename>        Text file containing SEI messages in the following format : <POC><space><PREFIX><space><NAL UNIT TYPE>/<SEI TYPE><space><SEI Payload>\n");
-    H0("-f/--frames <integer>            Maximum number of frames to encode. Default all\n");
-    H0("   --seek <integer>              First frame to encode\n");
-    H1("   --[no-]interlace <bff|tff>    Indicate input pictures are interlace fields in temporal order. Default progressive\n");
-    H0("   --[no-]field                  Enable or disable field coding. Default %s\n", OPT( param->bField));
-    H1("   --dither                      Enable dither if downscaling to 8 bit pixels. Default disabled\n");
-    H0("   --[no-]copy-pic               Copy buffers of input picture in frame. Default %s\n", OPT(param->bCopyPicToFrame));
-    H0("\nQuality reporting metrics:\n");
-    H0("   --[no-]ssim                   Enable reporting SSIM metric scores. Default %s\n", OPT(param->bEnableSsim));
-    H0("   --[no-]psnr                   Enable reporting PSNR metric scores. Default %s\n", OPT(param->bEnablePsnr));
-    H0("\nProfile, Level, Tier:\n");
-    H0("-P/--profile <string>            Enforce an encode profile: main, main10, mainstillpicture\n");
-    H0("   --level-idc <integer|float>   Force a minimum required decoder level (as '5.0' or '50')\n");
-    H0("   --[no-]high-tier              If a decoder level is specified, this modifier selects High tier of that level\n");
-    H0("   --uhd-bd                      Enable UHD Bluray compatibility support\n");
-    H0("   --[no-]allow-non-conformance  Allow the encoder to generate profile NONE bitstreams. Default %s\n", OPT(param->bAllowNonConformance));
-    H0("\nThreading, performance:\n");
-    H0("   --pools <integer,...>         Comma separated thread count per thread pool (pool per NUMA node)\n");
-    H0("                                 '-' implies no threads on node, '+' implies one thread per core on node\n");
-    H0("-F/--frame-threads <integer>     Number of concurrently encoded frames. 0: auto-determined by core count\n");
-    H0("   --[no-]wpp                    Enable Wavefront Parallel Processing. Default %s\n", OPT(param->bEnableWavefront));
-    H0("   --[no-]slices <integer>       Enable Multiple Slices feature. Default %d\n", param->maxSlices);
-    H0("   --[no-]pmode                  Parallel mode analysis. Default %s\n", OPT(param->bDistributeModeAnalysis));
-    H0("   --[no-]pme                    Parallel motion estimation. Default %s\n", OPT(param->bDistributeMotionEstimation));
-    H0("   --[no-]asm <bool|int|string>  Override CPU detection. Default: auto\n");
-    H0("\nPresets:\n");
-    H0("-p/--preset <string>             Trade off performance for compression efficiency. Default medium\n");
-    H0("                                 ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo\n");
-    H0("-t/--tune <string>               Tune the settings for a particular type of source or situation:\n");
-    H0("                                 psnr, ssim, grain, zerolatency, fastdecode\n");
-    H0("\nQuad-Tree size and depth:\n");
-    H0("-s/--ctu <64|32|16>              Maximum CU size (WxH). Default %d\n", param->maxCUSize);
-    H0("   --min-cu-size <64|32|16|8>    Minimum CU size (WxH). Default %d\n", param->minCUSize);
-    H0("   --max-tu-size <32|16|8|4>     Maximum TU size (WxH). Default %d\n", param->maxTUSize);
-    H0("   --tu-intra-depth <integer>    Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth);
-    H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
-    H0("   --limit-tu <0..4>             Enable early exit from TU recursion for inter coded blocks. Default %d\n", param->limitTU);
-    H0("\nAnalysis:\n");
-    H0("   --rd <1..6>                   Level of RDO in mode decision 1:least....6:full RDO. Default %d\n", param->rdLevel);
-    H0("   --[no-]psy-rd <0..5.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
-    H0("   --[no-]rdoq-level <0|1|2>     Level of RDO in quantization 0:none, 1:levels, 2:levels & coding groups. Default %d\n", param->rdoqLevel);
-    H0("   --[no-]psy-rdoq <0..50.0>     Strength of psycho-visual optimization in RDO quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
-    H0("   --dynamic-rd <0..4.0>         Strength of dynamic RD, 0 to disable. Default %.2f\n", param->dynamicRd);
-    H0("   --[no-]ssim-rd                Enable ssim rate distortion optimization, 0 to disable. Default %s\n", OPT(param->bSsimRd));
-    H0("   --[no-]rd-refine              Enable QP based RD refinement for rd levels 5 and 6. Default %s\n", OPT(param->bEnableRdRefine));
-    H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
-    H0("   --[no-]rskip                  Enable early exit from recursion. Default %s\n", OPT(param->bEnableRecursionSkip));
-    H1("   --[no-]tskip-fast             Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
-    H1("   --[no-]splitrd-skip           Enable skipping split RD analysis when sum of split CU rdCost larger than one split CU rdCost for Intra CU. Default %s\n", OPT(param->bEnableSplitRdSkip));
-    H1("   --nr-intra <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
-    H1("   --nr-inter <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n");
-    H0("   --ctu-info <integer>          Enable receiving ctu information asynchronously and determine reaction to the CTU information (0, 1, 2, 4, 6) Default 0\n"
-       "                                    - 1: force the partitions if CTU information is present\n"
-       "                                    - 2: functionality of (1) and reduce qp if CTU information has changed\n"
-       "                                    - 4: functionality of (1) and force Inter modes when CTU Information has changed, merge/skip otherwise\n"
-       "                                    Enable this option only when planning to invoke the API function x265_encoder_ctu_info to copy ctu-info asynchronously\n");
-    H0("\nCoding tools:\n");
-    H0("-w/--[no-]weightp                Enable weighted prediction in P slices. Default %s\n", OPT(param->bEnableWeightedPred));
-    H0("   --[no-]weightb                Enable weighted prediction in B slices. Default %s\n", OPT(param->bEnableWeightedBiPred));
-    H0("   --[no-]cu-lossless            Consider lossless mode in CU RDO decisions. Default %s\n", OPT(param->bCULossless));
-    H0("   --[no-]signhide               Hide sign bit of one coeff per TU (rdo). Default %s\n", OPT(param->bEnableSignHiding));
-    H1("   --[no-]tskip                  Enable intra 4x4 transform skipping. Default %s\n", OPT(param->bEnableTransformSkip));
-    H0("\nTemporal / motion search options:\n");
-    H0("   --max-merge <1..5>            Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
-    H0("   --ref <integer>               max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
-    H0("   --limit-refs <0|1|2|3>        Limit references per depth (1) or CU (2) or both (3). Default %d\n", param->limitReferences);
-    H0("   --me <string>                 Motion search method dia hex umh star full. Default %d\n", param->searchMethod);
-    H0("-m/--subme <integer>             Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine);
-    H0("   --merange <integer>           Motion search range. Default %d\n", param->searchRange);
-    H0("   --[no-]rect                   Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
-    H0("   --[no-]amp                    Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
-    H0("   --[no-]limit-modes            Limit rectangular and asymmetric motion predictions. Default %d\n", param->limitModes);
-    H1("   --[no-]temporal-mvp           Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
-    H1("   --[no-]hme                    Enable Hierarchical Motion Estimation. Default %s\n", OPT(param->bEnableHME));
-    H1("   --hme-search <string>         Motion search-method for HME L0,L1 and L2. Default(L0,L1,L2) is %d,%d,%d\n", param->hmeSearchMethod[0], param->hmeSearchMethod[1], param->hmeSearchMethod[2]);
​